-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathindex.go
120 lines (106 loc) · 3.47 KB
/
index.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
// Copyright 2019 PaperCut Software International Pty Ltd. All rights reserved.
package main
import (
"flag"
"fmt"
"os"
"path/filepath"
"runtime/pprof"
"time"
"github.com/papercutsoftware/pdfsearch"
"github.com/papercutsoftware/pdfsearch/examples/cmd_utils"
)
// usage is the help text that main passes to cmd_utils.MakeUsage.
const usage = `Usage: go run index.go [OPTIONS] pcng-manual*.pdf
Adds PDFs that match "pcng-manual*.pdf" to the index.
`
// main parses the command line, builds the list of PDF paths from the non-flag arguments and
// indexes them with runIndexShow.
//
// Flags:
//
//	-s  directory in which the on-disk index is stored.
//	-p  write a Go CPU profile of the run to "cpu.index.prof".
//
// The process exits with status 1 if no arguments are given, no files match, profiling cannot be
// started, or indexing fails.
func main() {
	persistDir := filepath.Join(pdfsearch.DefaultPersistRoot, "my.computer")
	doCPUProfile := false
	flag.StringVar(&persistDir, "s", persistDir, "The on-disk index is stored here.")
	flag.BoolVar(&doCPUProfile, "p", doCPUProfile, "Do Go CPU profiling.")
	cmd_utils.MakeUsage(usage)
	flag.Parse()
	pdfsearch.InitLogging()
	if len(flag.Args()) < 1 {
		flag.Usage()
		os.Exit(1)
	}

	// Read the files to index into `pathList`.
	pathList, err := cmd_utils.PatternsToPaths(flag.Args())
	if err != nil {
		fmt.Fprintf(os.Stderr, "PatternsToPaths failed. args=%#q err=%v\n", flag.Args(), err)
		os.Exit(1)
	}
	pathList = cmd_utils.CleanCorpus(pathList)
	if len(pathList) < 1 {
		fmt.Fprintf(os.Stderr, "No files matching %q.\n", flag.Args())
		os.Exit(1)
	}
	pathList = cmd_utils.PartShuffle(pathList)

	// stopProfile flushes and closes the CPU profile. It is called explicitly instead of being
	// deferred because os.Exit does not run deferred functions, so a deferred stop would be
	// skipped on the failure path below and the profile would be left incomplete.
	stopProfile := func() {}
	if doCPUProfile {
		profilePath := "cpu.index.prof"
		fmt.Printf("Profiling to %s\n", profilePath)
		f, err := os.Create(profilePath)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
			os.Exit(1)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			f.Close()
			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
			os.Exit(1)
		}
		stopProfile = func() {
			pprof.StopCPUProfile()
			if err := f.Close(); err != nil {
				fmt.Fprintf(os.Stderr, "Error: closing %s: %v\n", profilePath, err)
			}
		}
	}

	// Build the index and report on it.
	err = runIndexShow(pathList, persistDir)
	stopProfile()
	if err != nil {
		fmt.Fprintf(os.Stderr, "runIndexShow failed. err=%v\n", err)
		os.Exit(1)
	}
}
// runIndexShow indexes the PDFs in `pathList` with runIndex and, if that succeeds, reports on the
// resulting pdfsearch.PdfIndex with showIndex.
// `persistDir`: The directory the pdfsearch.PdfIndex is saved in.
// Returns the first error from runIndex or showIndex.
func runIndexShow(pathList []string, persistDir string) error {
	index, elapsed, err := runIndex(pathList, persistDir)
	if err == nil {
		err = showIndex(pathList, index, elapsed)
	}
	return err
}
// runIndex calls pdfsearch.IndexPdfFiles on the PDFs in `pathList`, storing the index in
// directory `persistDir`, and returns the resulting pdfsearch.PdfIndex together with the time the
// call took.
// On error the duration is zero and the error from pdfsearch.IndexPdfFiles is returned unchanged.
// This is the main function. It shows you how to create or open an index.
func runIndex(pathList []string, persistDir string) (pdfIndex pdfsearch.PdfIndex, dt time.Duration,
	err error) {
	fmt.Fprintf(os.Stderr, "Indexing %d files. Index stored in %q.\n", len(pathList), persistDir)

	start := time.Now()
	if pdfIndex, err = pdfsearch.IndexPdfFiles(pathList, persistDir, report); err != nil {
		return pdfIndex, 0, err
	}
	return pdfIndex, time.Since(start), nil
}
// showIndex writes a summary of `pdfIndex` to stderr: the page and file counts, the indexing time
// `dt`, the resulting pages/sec rate, and the index's own string form.
// The rate is reported as 0.0 when `dt` is under 0.01 seconds.
// `pathList` is the list of PDFs the index was built from; it is not used here.
// Always returns nil.
func showIndex(pathList []string, pdfIndex pdfsearch.PdfIndex, dt time.Duration) error {
	files := pdfIndex.NumFiles()
	pages := pdfIndex.NumPages()
	secs := dt.Seconds()

	var rate float64
	if secs >= 0.01 {
		rate = float64(pages) / secs
	}

	fmt.Fprintf(os.Stderr, "%d pages from %d PDFs in %.1f secs (%.1f pages/sec)\n",
		pages, files, secs, rate)
	fmt.Fprintf(os.Stderr, "%s\n", pdfIndex)
	return nil
}
// report is the callback that runIndex hands to pdfsearch.IndexPdfFiles (presumably invoked with
// progress messages). It writes `msg` to stderr on its own line, prefixed with ">> ".
func report(msg string) {
	fmt.Fprintln(os.Stderr, ">> "+msg)
}