-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.py
115 lines (100 loc) · 3.98 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
INDEX_DIR = "IndexFiles"
import re
import sys, os, lucene, threading, time
from datetime import datetime
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import \
FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import SimpleFSDirectory
"""
This module is loosely based on the Lucene (Java implementation) demo class
org.apache.lucene.demo.IndexFiles. It takes a directory as an argument and
indexes the '.xml' files found in that directory and, recursively, in its
subdirectories. Each document is indexed on the file name, the path of the
directory containing the file, and the file contents. The resulting Lucene
index is written to a directory named by INDEX_DIR ('IndexFiles'), located
next to this script.
"""
class Ticker(object):
    """Console progress indicator that emits one dot per second.

    ``run`` is meant to be used as a thread target: it keeps printing
    until some other code sets the ``tick`` attribute to a false value.
    """

    def __init__(self):
        # Flag polled by run(); clear it to make the loop terminate.
        self.tick = True

    def run(self):
        """Write '.' to stdout once per second for as long as ``tick`` is set."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            # Flush right away so each dot shows up immediately.
            sys.stdout.flush()
            time.sleep(1.0)
class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""
    # NOTE: the docstring above is printed verbatim as the command-line usage
    # message by the script's __main__ section, so its text is left unchanged.
    #
    # Constructing an instance does all of the work: it opens (creating if
    # necessary) the index directory, indexes every '.xml' file found under
    # the given root, then commits and closes the index writer.

    def __init__(self, root, storeDir, analyzer):
        """Create a fresh index in ``storeDir`` from the files under ``root``.

        Args:
            root: Directory that is walked recursively for '.xml' files.
            storeDir: Directory that holds the Lucene index; it is created
                if missing, and any existing index in it is overwritten
                (the writer is opened in CREATE mode).
            analyzer: Lucene analyzer; it is wrapped so that at most
                1048576 tokens per field are indexed.
        """
        # exist_ok avoids the check-then-create race of exists()+mkdir(),
        # and makedirs also creates missing parent directories.
        os.makedirs(storeDir, exist_ok=True)

        store = SimpleFSDirectory(Paths.get(storeDir))
        limited_analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(limited_analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)

        ticker = Ticker()
        # Keep the cursor on this line so the progress dots and 'done'
        # follow the message (the old trailing-comma form was a Python 2
        # idiom that no longer suppresses the newline).
        print('commit index', end='', flush=True)
        threading.Thread(target=ticker.run).start()
        try:
            writer.commit()
            writer.close()
        finally:
            # Always stop the ticker: otherwise a failing commit/close would
            # leave the non-daemon ticker thread running forever.
            ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Add one document per '.xml' file found under ``root`` to ``writer``.

        Each document gets a "name" field (the file name), a "path" field
        (the directory containing the file) and, when the file is not empty,
        a "contents" field with the file text decoded as UTF-8.

        A failure on a single file is reported on stdout and does not stop
        the indexing of the remaining files.

        Args:
            root: Directory that is walked recursively.
            writer: Object whose ``addDocument`` receives each document.
        """
        # Stored but not tokenized: the value is indexed as a single term.
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        # Stored and tokenized, with positions recorded in the index.
        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for dirpath, _dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.xml'):
                    continue
                print("adding", filename)
                try:
                    path = os.path.join(dirpath, filename)
                    # 'with' guarantees the handle is closed even when
                    # reading fails (e.g. on invalid UTF-8).
                    with open(path, encoding='utf-8') as xml_file:
                        contents = xml_file.read()

                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", dirpath, t1))
                    if contents:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except Exception as e:
                    # Deliberately best-effort: report the problem and move
                    # on to the next file.
                    print("Failed in indexDocs:", e)
if __name__ == '__main__':
    # Script entry point: expects the directory to index as the first
    # command-line argument; prints the usage text and exits with status 1
    # when it is missing.
    if not len(sys.argv) >= 2:
        print(IndexFiles.__doc__)
        sys.exit(1)

    # The JVM must be started before any Lucene class is used.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)

    started_at = datetime.now()
    try:
        # The index directory lives next to this script, not in the
        # current working directory.
        script_path = os.path.abspath(sys.argv[0])
        base_dir = os.path.dirname(script_path)
        index_path = os.path.join(base_dir, INDEX_DIR)
        IndexFiles(sys.argv[1], index_path, StandardAnalyzer())
        # Report the elapsed wall-clock time of the indexing run.
        elapsed = datetime.now() - started_at
        print(elapsed)
    except Exception as err:
        # Print a short message, then re-raise so the failure is not hidden.
        print("Failed: ", err)
        raise err