-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_histo.py
49 lines (40 loc) · 1.2 KB
/
text_histo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# text_histo.py
import string
from itertools import islice
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    leading = islice(iterable, n)
    return [*leading]
def process_file(filename):
    """Build a word-frequency histogram from a UTF-8 encoded text file.

    filename: path of the text file to read.

    Returns a dict mapping each word (as normalized by process_line) to the
    number of times it occurs in the file.
    """
    hist = {}
    # The context manager guarantees the file is closed, even if processing
    # a line raises; the bare open() used previously leaked the handle.
    with open(filename, encoding='utf-8') as fp:
        for line in fp:
            process_line(line, hist)
    return hist
def process_line(line, hist):
    """Add the words found in one line of text to a histogram, in place.

    Hyphens are treated as word separators. Each whitespace-separated token
    has leading/trailing ASCII punctuation and whitespace stripped and is
    lowercased before being counted.

    line: string containing one line of text.
    hist: dict mapping word -> count; updated in place.
    """
    # Loop-invariant, so build the set of strippable characters once.
    strippables = string.punctuation + string.whitespace
    for token in line.replace('-', ' ').split():
        word = token.strip(strippables).lower()
        # A token made only of punctuation (e.g. "..." or "&") strips down
        # to the empty string, which is not a word and must not be counted.
        if not word:
            continue
        hist[word] = hist.get(word, 0) + 1
def total_words(hist):
    """Return the total number of word occurrences recorded in ``hist``."""
    running_total = 0
    for count in hist.values():
        running_total += count
    return running_total
def different_words(hist):
    """Return how many distinct words (keys) ``hist`` contains."""
    distinct_count = len(hist)
    return distinct_count
def most_common(hist):
    """Return ``(count, word)`` pairs from ``hist``, highest first.

    Pairs compare as tuples, so the order is descending by count and,
    for equal counts, descending by word.
    """
    pairs = ((count, word) for word, count in hist.items())
    return sorted(pairs, reverse=True)
def print_most_common(hist, num=10):
    """Print the ``num`` most frequent words in ``hist`` with their counts.

    A header line is printed first, then one tab-separated
    ``word<TAB>count`` line per entry, in the order given by most_common.
    """
    ranked = most_common(hist)
    print(f'The most common {num} words are:')
    for entry in ranked[:num]:
        count, token = entry
        print(token, count, sep='\t')
# Script section: build the histogram for the book text and report on it.
hist = process_file('emma.txt')

# Preview five (word, count) entries taken from the front of the histogram.
n_items = take(5, hist.items())
print(n_items)

# Overall size of the text and of its vocabulary.
print('Total number of words:', total_words(hist))
print('Number of different words:', different_words(hist))

# Top 10 (the default), then the top 20.
print_most_common(hist)
print_most_common(hist, num=20)