# sgt_20221004_files_exer1.py
# Exercise
# Read the text from sherlock_holmes_adventures.txt
# 1a -> write the function file_line_len(fpath), which returns the number of lines in the file
# check: file_line_len("sherlock_holmes_adventures.txt") -> 12305
# 1b -> write the function get_text_lines(fpath), which returns a list with only those lines that contain text.
# So we want to filter out the lines that consist of whitespace only
# PS: preferably use encoding="utf-8" when reading
# 1c -> write the function save_lines(destpath, lines)
# This function stores all lines in the destpath file
# 1d -> call save_lines with destpath being "pure_sherlock.txt" and lines being the text lines we cleaned in 1b
# 1e -> write the function clean_punkts(srcpath, destpath)
# The function opens the srcpath file and removes every character listed in https://docs.python.org/3/library/string.html#string.punctuation
# then it saves the cleaned text into destpath
# clean_punkts("pure_sherlock.txt", "clean_sherlock.txt")
# 1f -> write the function get_word_usage(srcpath, destpath)
# The function opens the file and finds the most frequently used words
# recommendation: use Counter from the collections module!
# assume that words are separated by whitespace or newlines (the good old split() will come in handy)
# the words and their frequencies should be saved in destpath in the following form:
# word, count
# un, 3423
# es, 3242
# in effect you will be saving in standard CSV format - https://docs.python.org/3/library/csv.html
# you can use the csv module for this, but it is not necessary
# (a sketch implementation of 1f follows at the end of this file)
#1a
from pathlib import Path

def file_line_len(fpath):
    # Return the number of lines in the file at fpath.
    with open(fpath, encoding="utf-8") as fstream:
        return len(fstream.readlines())

# print(file_line_len("sherlock_holmes_adventures.txt"))  # expected: 12305
#1b
fpath = Path("sherlock_holmes_adventures.txt")
def get_text_lines(fpath):
with open(fpath, encoding="utf-8") as f:
lines = [line.rstrip() for line in f if line.strip()]
return lines
# print(get_text_lines(fpath))
text_lines = get_text_lines(fpath)
#1c_d
def save_lines(destpath, lines, sep='\n', encoding='utf-8'):
    # Write every line to destpath, appending sep (newline by default) after each one.
    with open(destpath, "w", encoding=encoding) as f:
        for line in lines:
            f.write(line + sep)

save_lines("pure_sherlock_b.txt", text_lines)
# The lines from 1b were rstrip()-ed, so re-add the newlines explicitly and pass sep=""
text_lines_with_newlines = [line + '\n' for line in text_lines]
save_lines("pure_sherlock.txt", text_lines_with_newlines, sep="")
#1e
import string

def clean_punkts(srcpath, destpath):
    # Copy srcpath to destpath while removing every character found in string.punctuation.
    with open(srcpath, encoding="utf-8") as fin, open(destpath, mode="w", encoding="utf-8") as fout:
        for line in fin:
            for character in line:
                if character in string.punctuation:  # the if is optional: replace() is a no-op for other characters
                    line = line.replace(character, '')
            fout.write(line)
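# Alternative sketch, not part of the original solution: str.maketrans/str.translate can drop
# all punctuation in one pass instead of looping over characters. The function name
# clean_punkts_translate is invented here for illustration.
def clean_punkts_translate(srcpath, destpath):
    table = str.maketrans("", "", string.punctuation)  # map every punctuation character to None
    with open(srcpath, encoding="utf-8") as fin, open(destpath, mode="w", encoding="utf-8") as fout:
        for line in fin:
            fout.write(line.translate(table))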
srcpath = "pure_sherlock.txt"
destpath = "clean_sherlock.txt"
clean_punkts(srcpath, destpath)
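#1f
# Sketch for get_word_usage as described at the top of the file: count words with
# collections.Counter and write "word,count" rows using the csv module. The lower-casing,
# the optional top_n cutoff and the output file name in the commented call below are
# assumptions, not requirements from the exercise text.
from collections import Counter
import csv

def get_word_usage(srcpath, destpath, top_n=None):
    with open(srcpath, encoding="utf-8") as fin:
        words = fin.read().lower().split()  # split() handles spaces and newlines alike
    counts = Counter(words)
    most_common = counts.most_common(top_n)  # top_n=None keeps every word, most frequent first
    with open(destpath, "w", encoding="utf-8", newline="") as fout:
        writer = csv.writer(fout)
        writer.writerow(["word", "count"])
        writer.writerows(most_common)

# get_word_usage("clean_sherlock.txt", "sherlock_word_usage.csv")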