-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript_for_saving_pdffields.py
71 lines (55 loc) · 2.03 KB
/
script_for_saving_pdffields.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 5 11:59:37 2021
@author: lakna
"""
# Required libraries
import os
import gensim
import sklearn
import random
import pdfplumber
import csv
# nltk related libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK, loaded once when this module is run.
# NOTE(review): `stop` is not referenced by any code visible in this file
# (cleaning() calls stopwords.words() itself) - confirm before relying on it.
stop = stopwords.words('english')
from tika import parser
from tika import unpack
import pandas as pd
import json
def cleaning():
    """Tokenise the text of every file in the current working directory.

    Each name returned by ``os.listdir()`` is handed to Tika's
    ``parser.from_file``. The extracted ``"content"`` text is tokenised with
    NLTK, reduced to lower-cased, purely alphabetic tokens, and stripped of
    English stop words.

    Returns:
        dict: maps each successfully processed file name to its list of
        cleaned tokens (document order preserved). Files whose content
        cannot be tokenised (``TypeError``) are reported on stdout and
        omitted from the result.
    """
    # Build the stop-word lookup once per call. The previous version called
    # stopwords.words("english") for every single token, re-reading the list
    # and scanning it linearly each time; a set gives O(1) membership tests.
    stop_words = set(stopwords.words("english"))

    permitdict = {}
    for permit in os.listdir():
        parsed = parser.from_file(permit)
        line = parsed["content"]
        try:
            tokens = nltk.word_tokenize(line)
        except TypeError:
            # presumably Tika returned no text (content is None) - the file
            # is skipped rather than aborting the whole run.
            print("The following PDF could not be parsed\n", permit)
            continue

        # Keep alphabetic tokens only, lower-case them, then drop stop words.
        words_an = [token.lower() for token in tokens if token.isalpha()]
        permitdict[permit] = [word for word in words_an if word not in stop_words]
    return permitdict
# Base folder that holds one sub-folder of PDFs per permit category, plus the
# "raw_datafields" folder that receives the generated CSV files.
root = "C:/Users/lakna/OneDrive/Desktop/CityGrows/similarity_engine/"
output_dir = root + "raw_datafields/"

# (input sub-folder under root, name of the CSV written to output_dir)
categories = [
    ("boards_and_comissions", "boardsdata.csv"),
    ("building_permits", "buildingdata.csv"),
    ("business_licence", "bizdata.csv"),
    ("dog_licence", "dogdata.csv"),
]

for folder, csv_name in categories:
    # cleaning() reads the current working directory, so switch into the
    # category folder before calling it.
    os.chdir(root + folder)
    category_tokens = cleaning()

    # One row per file (the index is the file name), one column per token
    # position; shorter token lists are padded by pandas.
    unclean_data = pd.DataFrame.from_dict(category_tokens, orient='index')
    unclean_data.to_csv(output_dir + csv_name)