Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fallback-encoding per-repo option for non-utf8 text files #388

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions codesearch/index/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package index

import (
"errors"
"fmt"
"io"
"io/ioutil"
Expand All @@ -14,6 +15,7 @@ import (
"unsafe"

"github.com/hound-search/hound/codesearch/sparse"
"golang.org/x/text/encoding"
)

// Index writing. See read.go for details of on-disk format.
Expand Down Expand Up @@ -123,17 +125,17 @@ func (ix *IndexWriter) AddFile(name string) {
func (ix *IndexWriter) Add(name string, f io.Reader) string {
ix.trigram.Reset()
var (
c = byte(0) //nolint
c = byte(0) //nolint
i = 0
buf = ix.inbuf[:0]
tv = uint32(0)
n = int64(0)
linelen = 0
numLines = 0
longLines = 0
skipReason = "" //nolint
skipReason = "" //nolint
)

const invalidUTF8 = "Invalid UTF-8"
for {
tv = (tv << 8) & (1<<24 - 1)
if i >= len(buf) {
Expand All @@ -144,6 +146,9 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string {
break
}
log.Printf("%s: %v\n", name, err)
if errors.Is(err, encoding.ErrInvalidUTF8) {
return invalidUTF8
}
return ""
}
log.Printf("%s: 0-length read\n", name)
Expand All @@ -159,7 +164,7 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string {
ix.trigram.Add(tv)
}
if !validUTF8((tv>>8)&0xFF, tv&0xFF) {
skipReason = "Invalid UTF-8"
skipReason = invalidUTF8
if ix.LogSkip {
log.Printf("%s: %s\n", name, skipReason)
}
Expand Down Expand Up @@ -246,7 +251,7 @@ func (ix *IndexWriter) Flush() {

os.Remove(ix.nameData.name)
for _, d := range ix.postData {
unmmap(d) //nolint
unmmap(d) //nolint
}
for _, f := range ix.postFile {
f.Close()
Expand Down Expand Up @@ -310,7 +315,7 @@ func (ix *IndexWriter) flushPost() {
}

ix.post = ix.post[:0]
w.Seek(0, 0) //nolint
w.Seek(0, 0) //nolint
ix.postFile = append(ix.postFile, w)
}

Expand Down Expand Up @@ -368,7 +373,7 @@ type postChunk struct {
m []postEntry // remaining entries after e
}

const postBuf = 4096 //nolint
const postBuf = 4096 //nolint

// A postHeap is a heap (priority queue) of postChunks.
type postHeap struct {
Expand All @@ -388,7 +393,7 @@ func (h *postHeap) addMem(x []postEntry) {

// step reads the next entry from ch and saves it in ch.e.
// It returns false if ch is over.
func (h *postHeap) step(ch *postChunk) bool { //nolint
func (h *postHeap) step(ch *postChunk) bool { //nolint
old := ch.e
m := ch.m
if len(m) == 0 {
Expand All @@ -414,7 +419,7 @@ func (h *postHeap) add(ch *postChunk) {
}

// empty reports whether the postHeap is empty.
func (h *postHeap) empty() bool { //nolint
func (h *postHeap) empty() bool { //nolint
return len(h.ch) == 0
}

Expand Down Expand Up @@ -492,7 +497,7 @@ type bufWriter struct {
name string
file *os.File
buf []byte
tmp [8]byte //nolint
tmp [8]byte //nolint
}

// bufCreate creates a new file with the given name and returns a
Expand Down Expand Up @@ -578,7 +583,7 @@ func (b *bufWriter) flush() {
func (b *bufWriter) finish() *os.File {
b.flush()
f := b.file
f.Seek(0, 0) //nolint
f.Seek(0, 0) //nolint
return f
}

Expand Down
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type Repo struct {
ExcludeDotFiles bool `json:"exclude-dot-files"`
EnablePollUpdates *bool `json:"enable-poll-updates"`
EnablePushUpdates *bool `json:"enable-push-updates"`
FallbackEncoding string `json:"fallback-encoding"`
}

// Used for interpreting the config value for fields that use *bool. If a value
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ go 1.13
require (
github.com/blang/semver v3.5.1+incompatible
github.com/go-bindata/go-bindata v3.1.2+incompatible // indirect
golang.org/x/text v0.3.5
)
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnweb
github.com/go-bindata/go-bindata v1.0.0 h1:DZ34txDXWn1DyWa+vQf7V9ANc2ILTtrEjtlsdJRF26M=
github.com/go-bindata/go-bindata v3.1.2+incompatible h1:5vjJMVhowQdPzjE1LdxyFF7YFTXg5IgGVW4gBr5IbvE=
github.com/go-bindata/go-bindata v3.1.2+incompatible/go.mod h1:xK8Dsgwmeed+BBsSy2XTopBn/8uK2HWuGSnA11C3Joo=
golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
68 changes: 51 additions & 17 deletions index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (

"github.com/hound-search/hound/codesearch/index"
"github.com/hound-search/hound/codesearch/regexp"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)

const (
Expand All @@ -38,6 +40,7 @@ type Index struct {
type IndexOptions struct {
ExcludeDotFiles bool
SpecialFiles []string
FallbackEnc encoding.Encoding
}

type SearchOptions struct {
Expand Down Expand Up @@ -236,12 +239,12 @@ func (n *Index) Search(pat string, opt *SearchOptions) (*SearchResponse, error)
Matches: results,
FilesWithMatch: filesFound,
FilesOpened: filesOpened,
Duration: time.Now().Sub(startedAt), //nolint
Duration: time.Now().Sub(startedAt), //nolint
Revision: n.Ref.Rev,
}, nil
}

func isTextFile(filename string) (bool, error) {
func isTextFile(filename string) (isText bool, err error) {
buf := make([]byte, filePeekSize)
r, err := os.Open(filename)
if err != nil {
Expand All @@ -256,14 +259,14 @@ func isTextFile(filename string) (bool, error) {

buf = buf[:n]

if n < filePeekSize {
// read the whole file, must be valid.
return utf8.Valid(buf), nil
if n < filePeekSize && utf8.Valid(buf) || // read the whole file, must be valid.
n >= filePeekSize && validUTF8IgnoringPartialTrailingRune(buf) { // read a prefix, allow trailing partial runes.
return true, nil
}

// read a prefix, allow trailing partial runes.
return validUTF8IgnoringPartialTrailingRune(buf), nil

if isBinary(buf) {
return false, nil
}
return true, nil
}

// Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed
Expand Down Expand Up @@ -292,28 +295,59 @@ func validUTF8IgnoringPartialTrailingRune(p []byte) bool {
return true
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) {
// isBinary reports whether p looks like binary (non-text) data.
//
// It returns true as soon as it finds a control byte below horizontal tab
// (0x00 through 0x08, i.e. NUL..backspace). Tab (0x09), line feed (0x0A) and
// every higher byte value are accepted, so tab-indented text in a legacy
// single-byte encoding is not misclassified as binary. An empty or nil
// buffer is reported as not binary.
func isBinary(p []byte) bool {
	for _, c := range p {
		// Tab itself is ordinary text whitespace; only bytes strictly
		// below it are treated as a binary marker.
		if c < '\t' {
			return true
		}
	}
	return false
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string, fallbackEnc encoding.Encoding) (string, error) {
rel, err := filepath.Rel(src, path)
if err != nil {
return "", err
}

r, err := os.Open(path)
fh, err := os.Open(path)
if err != nil {
return "", err
}
defer r.Close()
defer fh.Close()

dup := filepath.Join(dst, "raw", rel)
w, err := os.Create(dup)
if err != nil {
return "", err
}
defer w.Close()

g := gzip.NewWriter(w)
defer g.Close()
r := io.Reader(fh)

// Without fallback encoding, assume UTF-8.
maybeValidated := r
if fallbackEnc != nil {
maybeValidated = transform.NewReader(r, encoding.UTF8Validator)
}
skipReason := ix.Add(rel, io.TeeReader(maybeValidated, g))
if fallbackEnc == nil || skipReason == "" || skipReason != "Invalid UTF-8" {
return skipReason, nil
}

// Reset, then try the fallback encoding.
if _, err = fh.Seek(0, 0); err != nil {
return skipReason, err
}
if _, err = w.Seek(0, 0); err != nil {
return skipReason, err
}
if err = w.Truncate(0); err != nil {
return skipReason, err
}
g.Reset(w)
r = fallbackEnc.NewDecoder().Reader(r)
return ix.Add(rel, io.TeeReader(r, g)), nil
}

Expand Down Expand Up @@ -364,7 +398,7 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error {
}
defer fileHandle.Close()

if err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { //nolint
if err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { //nolint
name := info.Name()
rel, err := filepath.Rel(src, path)
if err != nil {
Expand Down Expand Up @@ -404,20 +438,20 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error {
return nil
}

txt, err := isTextFile(path)
isText, err := isTextFile(path)
if err != nil {
return err
}

if !txt {
if !isText {
excluded = append(excluded, &ExcludedFile{
rel,
reasonNotText,
})
return nil
}

reasonForExclusion, err := addFileToIndex(ix, dst, src, path)
reasonForExclusion, err := addFileToIndex(ix, dst, src, path, opt.FallbackEnc)
if err != nil {
return err
}
Expand Down
37 changes: 35 additions & 2 deletions index/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ import (
"path/filepath"
"runtime"
"testing"

"github.com/hound-search/hound/codesearch/index"
"golang.org/x/text/encoding/charmap"
)

const (
Expand Down Expand Up @@ -35,7 +38,7 @@ func TestSearch(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer ref.Remove() //nolint
defer ref.Remove() //nolint

// Make sure the metadata in the ref is good.
if ref.Rev != rev {
Expand Down Expand Up @@ -79,7 +82,7 @@ func TestRead(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer ref.Remove() //nolint
defer ref.Remove() //nolint

r, err := Read(ref.Dir())
if err != nil {
Expand All @@ -100,3 +103,33 @@ func TestRead(t *testing.T) {
}
defer idx.Close()
}

// TestFallbackEnc checks that addFileToIndex skips a non-UTF-8 file when no
// fallback encoding is given, and indexes the same file successfully when
// ISO 8859-2 is supplied as the fallback encoding.
func TestFallbackEnc(t *testing.T) {
	dst, err := ioutil.TempDir(os.TempDir(), "hound")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(dst)
	// addFileToIndex writes its gzipped copy under dst/raw, so the directory
	// must exist; fail here rather than with a confusing create error later.
	if err := os.MkdirAll(filepath.Join(dst, "raw"), 0701); err != nil {
		t.Fatal(err)
	}

	ix := index.Create(filepath.Join(dst, "tri"))
	defer ix.Close()

	// The fixture was generated with:
	//
	//   { for i in $(seq 0 $(( 2048 / 43 ))); do echo '2048 byte of ASCII to fill the peek buffer'; done; echo ''; echo 'árvíztűrő tükörfúrógép' |iconv -f UTF8 -t ISO8859-2; } > testdata/iso8859_2.txt
	const src = "testdata"
	const path = "iso8859_2.txt"

	// Without a fallback encoding the file must be skipped.
	skipReason, err := addFileToIndex(ix, dst, src, filepath.Join(src, path), nil)
	if err != nil {
		t.Fatal(err)
	}
	if skipReason == "" {
		t.Error("wanted skip, got success without fallback encoding")
	}

	// With the matching fallback encoding the file must be indexed.
	skipReason, err = addFileToIndex(ix, dst, src, filepath.Join(src, path), charmap.ISO8859_2)
	if err != nil {
		t.Fatal(err)
	}
	if skipReason != "" {
		t.Errorf("wanted success, got skip %q", skipReason)
	}
}
Loading