-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaddner.py
40 lines (31 loc) · 1.69 KB
/
addner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import argparse
import spacy
from tqdm import tqdm
# Shared spaCy pipeline: the small French model, loaded once at import time
nlp = spacy.load("fr_core_news_sm")
# Raise the pipeline's max_length limit to 2,000,000; tune this to your data
nlp.max_length = 2_000_000
def extract_ner_and_append(input_file, output_file):
    """Rewrite a text file with named-entity labels attached to its tokens.

    The input is read line by line. Each line is run through the
    module-level ``nlp`` pipeline and written back as its tokens joined by
    single spaces: a token with an entity type becomes ``text|LABEL``, any
    other token is written as plain ``text``. Tokens whose text is only
    whitespace are dropped. Exactly one output line is written per input
    line, so a blank input line produces a blank output line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Raises:
        ValueError: If both paths refer to the same existing file. Opening
            the output in write mode would truncate the input before it
            could be read.
    """
    import os  # local import so the function does not rely on file-level edits

    try:
        same_file = os.path.samefile(input_file, output_file)
    except OSError:
        # At least one path does not exist yet (typically the output), so
        # they cannot be the same file; a missing input is reported by open().
        same_file = False
    if same_file:
        raise ValueError(
            "input_file and output_file refer to the same file; "
            "writing the output would erase the input"
        )

    with open(input_file, 'r', encoding='utf-8') as f_input, \
            open(output_file, 'w', encoding='utf-8') as f_output:
        # tqdm only wraps the line iterator to display progress
        for line in tqdm(f_input, desc="Processing"):
            doc = nlp(line)
            # Collect the pieces first and join once instead of growing a
            # string with += for every token.
            pieces = [
                (token.text + "|" + token.ent_type_) if token.ent_type_ else token.text
                for token in doc
                if token.text.strip()  # skip spaces and newline tokens
            ]
            f_output.write(" ".join(pieces).rstrip() + '\n')
if __name__ == "__main__":
    # Command-line entry point: two positional paths, input then output
    cli = argparse.ArgumentParser(description="Extract NER and append to text")
    positional_args = (
        ("input_file", "Path to the input file"),
        ("output_file", "Path to save the modified file"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=str, help=arg_help)
    options = cli.parse_args()
    # Hand both paths to the annotation routine
    extract_ner_and_append(options.input_file, options.output_file)