#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# import python libs
import re
import json
import argparse
from os import listdir
from os.path import isfile, join
from pprint import pprint as pp
# import project libs
import sys
sys.path.append('lib')
import ner_pipeline
import nltk_tree_converter
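# NOTE: ner_pipeline (resolved via the lib/ path above) is assumed to expose the
# two helpers used below: sentence_splitting(text) -> list of sentence strings
# and word_tokenization(sentence) -> list of token strings.
# nltk_tree_converter is imported but not referenced directly in this file.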
# defining globals & constants
# -

def shape(raw_text):
    # simplify quotes
    raw_text = re.sub("``", '"', raw_text)
    raw_text = re.sub("''", '"', raw_text)

    # create a list of strings
    sentences = ner_pipeline.sentence_splitting(raw_text)

    # create a list of lists of strings
    tokenized_sentences = [word_tokenization(sentence) for sentence in sentences]

    sentences = []
    for sentence in tokenized_sentences:
        tokens = []
        for word in sentence:
            token = {
                'term': word
            }
            tokens.append(token)
        sentences.append(tokens)

    return sentences
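# Sketch of the returned structure (actual sentence and token boundaries
# depend on ner_pipeline):
#   shape('Alice meets Bob.')
#   -> [[{'term': 'Alice'}, {'term': 'meets'}, {'term': 'Bob'}, {'term': '.'}]]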

def word_tokenization(sentence):
    tokens = ner_pipeline.word_tokenization(sentence)

    # split multi-word tokens (e.g. hyphenated compounds)
    split_tokens = []
    for token in tokens:
        if len(token) >= 3 and '-' in token:
            split_tokens.extend(intersperse(token.split('-'), '-'))
        else:
            split_tokens.append(token)
    return split_tokens
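# Example: a token such as 'state-of-the-art' is expanded to
# ['state', '-', 'of', '-', 'the', '-', 'art'], so the hyphens are kept
# as tokens of their own.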

def intersperse(lst, item):
    result = [item] * (len(lst) * 2 - 1)
    result[0::2] = lst
    return result
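# Example: intersperse(['a', 'b', 'c'], '-') -> ['a', '-', 'b', '-', 'c'];
# an empty list yields an empty list ([item] * -1 == []).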

def raw_data_json_from(shaped_sentences, input_file_name):
    raw_datum = {
        'id': input_file_name,
        'data': shaped_sentences
    }
    return json.dumps(raw_datum)
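# The serialized raw datum is a single JSON object of the form
#   {"id": "<input file name>", "data": [...shaped paragraphs...]}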

def read_input_file(file_handler):
    global input_file_name
    input_file_name = file_handler.name
    content = file_handler.read()
    file_handler.close()

    # return a list of paragraphs
    return content.split("\n\n")
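# Paragraphs are assumed to be separated by blank lines, hence the split on "\n\n".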

def save_to_file(json_string, file_handler):
    file_handler.write(json_string)
    file_handler.close()

def iterate_plain_paragraphs(paragraphs):
    shaped_paragraphs = []
    for paragraph in paragraphs:
        sentences = shape(paragraph)
        shaped_paragraphs.append(sentences)
    return shaped_paragraphs
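# The nesting of the result is paragraphs -> sentences -> tokens,
# i.e. one shape() result per paragraph.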

# entry point as a stand-alone script
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Dalphi Iterate Service text shaper; converts plain text to raw data')
    parser.add_argument(
        '-i',
        '--input',
        type=argparse.FileType('r')
    )
    parser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w')
    )
    parser.add_argument(
        "-id",
        "--input_dir",
        help="directory containing the plain text (.txt) files to shape")
    parser.add_argument(
        "-od",
        "--output_dir",
        help="directory to write the shaped .json raw data files to")
    args = parser.parse_args()

    if args.input_dir and args.output_dir:
        path = args.input_dir
        for file_name in listdir(path):
            if not (isfile(join(path, file_name)) and file_name.endswith('.txt')): continue

            file_handler = open(join(path, file_name), 'r', encoding='utf-8')
            paragraphs = read_input_file(file_handler)
            shaped_paragraphs = iterate_plain_paragraphs(paragraphs)
            json_object = raw_data_json_from(shaped_paragraphs, file_name)

            file_handler = open(join(args.output_dir, file_name + '.json'), 'w', encoding='utf-8')
            save_to_file(json_object, file_handler)
    elif args.input and args.output:
        paragraphs = read_input_file(args.input)
        shaped_paragraphs = iterate_plain_paragraphs(paragraphs)
        json_object = raw_data_json_from(shaped_paragraphs, args.input.name)
        save_to_file(json_object, args.output)
    else:
        print('specify input and output (help: -h)')
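
# Example invocations (file and directory names are placeholders):
#   python3 text_shaper.py -i article.txt -o article.json
#   python3 text_shaper.py -id corpus/ -od shaped/
# In directory mode every .txt file in --input_dir is shaped and written
# to --output_dir as <name>.txt.json.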