-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPreproceesing.py
65 lines (47 loc) · 1.82 KB
/
Preproceesing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from suffixtree import *
# Stage 1 Filtering (Don't Run).
def read_filter_write(file, source_dir, target_dir, un_dir):
    """Filter one raw text file and write the result to the mirrored target path.

    The input is read from ``source_dir + un_dir + '/' + file``; its newlines
    are turned into spaces, the text is passed through ``filtering`` and the
    result is written to ``target_dir + un_dir + '/' + file``.

    Args:
        file: Name of the file inside the ``un_dir`` sub-directory.
        source_dir: Root of the unfiltered corpus. It is concatenated as-is,
            so it must already end with a path separator.
        target_dir: Root of the filtered corpus (same trailing-separator
            convention as ``source_dir``).
        un_dir: Sub-directory name shared by both roots.

    Returns:
        The filtered text that was written to the target file.
    """
    in_path = source_dir + un_dir + '/' + file
    out_path = target_dir + un_dir + '/' + file

    # The corpus is Devanagari text: pin UTF-8 instead of relying on the
    # platform default encoding, which is not UTF-8 everywhere.
    with open(in_path, 'r', encoding='utf-8') as f:
        data = f.read().replace('\n', ' ')

    data = filtering(data)

    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(data)
    return data
# Remove ASCII punctuation and numerals, keep only Devanagari Unicode characters, remove Devanagari numerals,
# and remove extra whitespace.
def filtering(data):
    """Reduce *data* to Devanagari text whose words are separated by single spaces.

    The substitutions are applied in this order:

    1. Markup tags of the form ``<...>`` are removed one tag at a time, so
       text between two tags is kept.
    2. ASCII punctuation, ASCII digits and the danda are removed.
    3. Every character that is neither whitespace nor inside the Devanagari
       block (U+0900-U+097F) is removed. Whitespace is kept here so that
       tab- or newline-separated words are not glued together.
    4. The danda, double danda and Devanagari digits (U+0964-U+096F) are
       removed.
    5. Each run of whitespace is collapsed into one space.

    Leading and trailing whitespace is collapsed to a single space but not
    stripped.

    Args:
        data: The text to clean.

    Returns:
        The cleaned text.
    """
    # A tuple of pairs makes the required ordering explicit.
    substitutions = (
        (r'<[^>]*>', ''),
        (r'[!@#$%^&*()_+<>|,.:;()+=…&×{}<>"→?\'0-9।-]', ''),
        (r'[^\u0900-\u097F\s]', ''),
        (r'[\u0964-\u096F]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        data = re.sub(pattern, replacement, data)
    return data
# Walk every sub-directory of the unfiltered corpus and write a filtered copy
# of each file to the same relative location under the target root.
source_dir = 'Unfiltered/'
unfiltered_dirs = os.listdir(source_dir)
target_dir = 'Filtered/'
for un_dir in unfiltered_dirs:
    # os.listdir also returns plain files (e.g. editor/OS metadata); only
    # real sub-directories can be listed below.
    if not os.path.isdir(source_dir + un_dir):
        continue
    # read_filter_write opens the output file directly, so the mirrored
    # directory has to exist before the first write.
    os.makedirs(target_dir + un_dir, exist_ok=True)
    files = os.listdir(source_dir + un_dir)
    for file in files:
        # Nested directories cannot be opened as text files; skip them.
        if not os.path.isfile(source_dir + un_dir + '/' + file):
            continue
        read_filter_write(file, source_dir, target_dir, un_dir)
# Stage 2 Reading filtered files for each language and storing only unique words into a new folder. (Don't Run)
# Keep unique words with len > 2 as pickled lists
# For every sub-directory of the filtered corpus, collect the distinct words
# longer than two characters across all of its files and pickle them as a
# list named after the sub-directory.
filtered_dirs = os.listdir(target_dir)
# The pickles are written straight into this folder, so make sure it exists.
os.makedirs('Words List', exist_ok=True)
for f_dir in filtered_dirs:
    # Ignore stray plain files at the top level of the filtered corpus.
    if not os.path.isdir(target_dir + f_dir):
        continue
    files = os.listdir(target_dir + f_dir)
    unique_words = set()
    for file in files:
        # Devanagari text: read as UTF-8 rather than the platform default.
        with open(target_dir + f_dir + '/' + file, 'r', encoding='utf-8') as f:
            data = f.read()
        # Empty strings produced by consecutive spaces fail the length test,
        # so no separate empty-token filter is needed.
        unique_words.update(word for word in data.split(' ') if len(word) > 2)
    # Sort so the pickled list is identical from run to run (plain set
    # iteration order for strings varies between interpreter runs).
    word_list = sorted(unique_words)
    with open('Words List/' + f_dir + '.pkl', 'wb') as f:
        pickle.dump(word_list, f)