-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathJASSjr_index.rs
217 lines (199 loc) · 7.17 KB
/
JASSjr_index.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/*
JASSjr_index.rs
---------------
Copyright (c) 2024 Katelyn Harlan
Minimalistic BM25 search engine.
*/
use std::io::{BufRead, BufReader, BufWriter, Write, Seek};
use std::fs::File;
use std::convert::TryFrom;
use std::env;
use std::collections::HashMap;
/*
main()
------
Simple indexer for TREC WSJ collection
*/
fn main() -> std::io::Result<()> {
/*
Make sure we have one parameter, the filename
*/
let args: Vec<_> = env::args().collect();
if args.len() != 2 {
println!("Usage: {} <infile.xml>", args[0]);
std::process::exit(1);
}
let mut vocab: HashMap<String,Vec<(i32,i32)>> = HashMap::new();
let mut doc_ids: Vec<String> = Vec::new();
let mut doc_lengths: Vec<i32> = Vec::new();
let mut docid = -1;
let mut document_length = 0;
let mut push_next = false; // is the next token the primary key?
{
/*
Open the file to index
*/
let file = File::open(&args[1])?;
let mut reader = BufReader::new(file);
let mut buffer = String::new();
let mut token = String::new();
while reader.read_line(&mut buffer)? > 0 {
let mut chars = buffer.chars();
while let Some(c) = chars.next() {
/*
A token is either an XML tag '<'..'>' or a sequence of alphanumerics
*/
if !c.is_alphanumeric() {
//TREC <DOCNO> primary keys have a hyphen in them
if c == '-' && token.len() > 0 {
token.push(c);
continue;
}
if token.len() <= 0 && c != '<' {
continue;
}
//We are in a closing tag, not ending a token
if c == '/' && token.starts_with('<') {
token.push(c);
continue;
}
//We are at the end of a tag
if c == '>' && token.starts_with('<'){
token.push(c);
}
/*
If we see a <DOC> tag then we're at the start of the next document
*/
if token == "<DOC>" {
/*
Save the previous document length
*/
if docid != -1 {
doc_lengths.push(document_length);
}
/*
Move on to the next document
*/
docid +=1;
document_length = 0;
if docid % 1000 == 0 {
println!("{} documents indexed", docid);
}
}
/*
If the last token we saw was a <DOCNO> then the next token is the primary key
*/
if push_next {
doc_ids.push(token.clone());
push_next = false;
}
if token == "<DOCNO>" {
push_next = true;
}
/*
Don't index XML tags
*/
if !token.starts_with('<') && token.len() > 0 {
/*
Lowercase the string
*/
token = token.to_lowercase();
/*
Truncate long tokens at 255 characters (so that the length can be stored first and in a single byte)
*/
token.truncate(255);
/*
Add the posting to the in-memory index
*/
if vocab.contains_key(&token) {
let posting_list: &mut Vec<(i32,i32)> = vocab.get_mut(&token).unwrap();
let num_postings = posting_list.len() -1;
if posting_list[num_postings].0 == docid { //increase the tf
posting_list[num_postings].1 += 1;
} else { //if the docno for this occurrence has changed then create a new <d,tf> pair
posting_list.push((docid,1));
}
}else { //if the term isn't in the vocab yet
let posting_list = vec![(docid,1)];
vocab.insert(token.clone(), posting_list);
}
/*
Compute the document length
*/
document_length +=1;
}
token.clear();
if c == '<' {
token.push(c);
}
}else {
token.push(c);
}
}
buffer.clear();
}
}
/*
If we didn't index any documents then we're done
*/
if docid == -1 {
return Ok(());
}
/*
Tell the user we've got to the end of parsing
*/
println!("Indexed {} documents. Serialising...", docid+1);
/*
Save the final document length
*/
doc_lengths.push(document_length);
/*
Store the primary keys
*/
{
let mut writer = BufWriter::new(File::create("docids.bin")?);
for id in doc_ids {
writer.write_all(id.as_bytes())?;
writer.write_all(b"\n")?;
}
writer.flush()?;
}
/*
Serialise the in-memory index to disk
*/
{
let mut postings_writer = BufWriter::new(File::create("postings.bin")?);
let mut vocab_writer = BufWriter::new(File::create("vocab.bin")?);
for (term, postings_list) in &vocab {
/*
Write the postings list to one file
*/
let offset: i32 = i32::try_from(postings_writer.stream_position()?).unwrap();
for posting in postings_list {
postings_writer.write_all(&posting.0.to_ne_bytes())?;
postings_writer.write_all(&posting.1.to_ne_bytes())?;
}
/*
Write the vocabulary to second file (one byte length, string, '\0', 4 byte where, 4 byte size)
*/
vocab_writer.write_all(&[u8::try_from(term.len()).unwrap()])?;
vocab_writer.write_all(term.as_bytes())?;
vocab_writer.write_all(b"\0")?;
vocab_writer.write_all(&offset.to_ne_bytes())?;
vocab_writer.write_all(&i32::try_from(&postings_list.len() * 2 * 4).unwrap().to_ne_bytes())?;
}
postings_writer.flush()?;
vocab_writer.flush()?;
}
/*
Store the document lengths
*/
{
let mut writer = BufWriter::new(File::create("lengths.bin")?);
for length in &doc_lengths {
writer.write_all(&length.to_ne_bytes())?;
}
writer.flush()?;
}
Ok(())
}