-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcapitolo13.py
145 lines (127 loc) · 3.82 KB
/
capitolo13.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import random, string
from dizionari import istogramma
def prepara_testo(in_text, out_text, righe_da_saltare=249):
    """Write a lower-cased, punctuation-free copy of a text file.

    Args:
        in_text: Path of the file to read.
        out_text: Path of the file to write; it is created or overwritten.
        righe_da_saltare: Number of leading lines of ``in_text`` that are
            dropped without being copied. The default of 249 keeps the
            behaviour this function had when the value was hard-coded.

    Every remaining line is lower-cased, the characters
    ``. , ; : ! ? ( ) " “ ” « » … '`` are removed, and ``--``, ``‘`` and
    ``’`` are each replaced by a single space.
    """
    # A single translation table handles every one-character substitution in
    # one pass: curly single quotes become spaces, the rest are deleted.
    tabella = str.maketrans('‘’', '  ', '.,;:!?()"“”«»…\'')
    # Both files are managed by ``with`` so they are closed even on error.
    with open(in_text) as file_in, open(out_text, 'w') as file_out:
        for numero, riga in enumerate(file_in):
            if numero < righe_da_saltare:
                continue
            riga = riga.lower().translate(tabella)
            # '--' is two characters long, so it cannot go in the table; it
            # is replaced after the deletions, as in the original sequence.
            file_out.write(riga.replace('--', ' '))
def contaparole(in_file):
    """Return the number of whitespace-separated words in a text file.

    Args:
        in_file: Path of the file to read.

    Returns:
        The total count of tokens produced by ``str.split()`` over all lines.
    """
    # ``with`` closes the file once counting is done (it used to stay open).
    with open(in_file) as fin:
        return sum(len(riga.split()) for riga in fin)
def vocabolario(in_file):
    """Count how many times each word occurs in a text file.

    Args:
        in_file: Path of the file to read.

    Returns:
        A dict mapping every whitespace-separated token to its number of
        occurrences, in order of first appearance.
    """
    conteggi = {}
    # ``with`` closes the file once counting is done (it used to stay open).
    with open(in_file) as fin:
        for riga in fin:
            for parola in riga.split():
                conteggi[parola] = conteggi.get(parola, 0) + 1
    return conteggi
def top20(in_file):
    """Return the words of a file whose count is among the 20 highest counts.

    Args:
        in_file: Path of the file to read; it is counted with ``vocabolario``.

    Returns:
        A list of one-entry dicts ``{word: count}``, in the iteration order of
        the vocabulary (not sorted by count). A word is kept when its count
        equals one of the 20 largest count values, so ties at the cut-off can
        make the list longer than 20 entries.
    """
    vocab = vocabolario(in_file)
    # Sort the counts once; previously this ran again for every single word.
    frequenze_top = set(sorted(vocab.values())[-20:])
    return [
        {parola: frequenza}
        for parola, frequenza in vocab.items()
        if frequenza in frequenze_top
    ]
def extravocab(text, vocab):
    """List the words of a text that do not appear in a vocabulary file.

    Args:
        text: Path of the text file; each line is split on whitespace.
        vocab: Path of the vocabulary file, read as one entry per line with
            surrounding whitespace stripped.

    Returns:
        A list of the tokens of ``text`` missing from the vocabulary, in
        reading order. A token is listed once for every time it occurs.
    """
    # A set gives constant-time membership tests; both files are closed by
    # ``with`` as soon as they have been read.
    with open(vocab) as fin:
        parole_vocab = {riga.strip() for riga in fin}
    parole_extra = []
    with open(text) as txt:
        for riga in txt:
            for parola in riga.split():
                if parola not in parole_vocab:
                    parole_extra.append(parola)
    return parole_extra
def estrai(stringa):
    """Pick one random key of ``istogramma(stringa)``, weighted by its value.

    ``istogramma`` comes from the ``dizionari`` module; presumably it maps
    each item of ``stringa`` to its number of occurrences (confirm there).
    Each key is repeated as many times as its value and one entry of the
    resulting list is drawn with ``random.choice``.
    """
    frequenze = istogramma(stringa)
    urna = []
    for elemento in frequenze:
        # Repeat the key once per occurrence so the draw is weighted.
        urna += [elemento] * frequenze[elemento]
    return random.choice(urna)
def elabora_file(nomefile):
    """Build a word histogram from a text file.

    Args:
        nomefile: Path of the file to read.

    Returns:
        A dict filled by calling ``elabora_riga`` on every line of the file.
    """
    isto = {}
    # ``with`` closes the file after reading; the handle is no longer named
    # ``input``, which shadowed the builtin.
    with open(nomefile) as righe:
        for riga in righe:
            elabora_riga(riga, isto)
    return isto
def elabora_riga(riga, isto):
    """Add the words of one line of text to a histogram, in place.

    Hyphens are treated as word separators. Each token is stripped of leading
    and trailing punctuation and whitespace and lower-cased before counting.

    Args:
        riga: The line of text to process.
        isto: Dict mapping word -> count; it is updated in place.

    Returns:
        None.
    """
    # Build the strip set once instead of on every token.
    da_togliere = string.punctuation + string.whitespace
    for parola in riga.replace('-', ' ').split():
        parola = parola.strip(da_togliere).lower()
        if not parola:
            # A token made only of punctuation (e.g. "..." or "!!") strips
            # down to '', which is not a word and must not be counted.
            continue
        isto[parola] = isto.get(parola, 0) + 1
# Module-level histogram, built at import time by reading 'out.txt'
# (presumably the file written by prepara_testo -- confirm).
isto = elabora_file('out.txt')
def più_comuni(isto):
    """Return the histogram entries ordered from most to least frequent.

    Args:
        isto: Dict mapping word -> count.

    Returns:
        A list of ``(count, word)`` tuples sorted in descending order; words
        with the same count appear in descending order of the word itself.
    """
    # The count goes first in each tuple so that it drives the ordering.
    return sorted(
        ((frequenza, parola) for parola, frequenza in isto.items()),
        reverse=True,
    )
def stampa_più_comuni(isto, num=10):
    """Print a header followed by the ``num`` most frequent words.

    Args:
        isto: Histogram passed on to ``più_comuni``.
        num: How many leading entries of the ranking to print (default 10).

    Each printed line holds the word and its count separated by a tab.
    """
    classifica = più_comuni(isto)
    print('Le parole più comuni sono:')
    for frequenza, termine in classifica[:num]:
        print(f'{termine}\t{frequenza}')
def sottrai(d1, d2):
    """Return the set of items of ``d1`` that are not in ``d2``.

    For dict arguments the items compared are the keys.
    """
    solo_in_d1 = set(d1)
    solo_in_d1.difference_update(d2)
    return solo_in_d1
# Histogram built at import time from 'words.txt'.
parole = elabora_file('words.txt')
# Keys of isto that are not keys of parole.
diff = sottrai(isto, parole)
# Disabled output: would print every word of diff on a single line.
#for parola in diff:
# print(parola, end=' ')
def parola_a_caso(h):
    """Draw a random word from histogram ``h``, weighted by its count.

    Every word is repeated as many times as its count in a temporary list,
    from which a single entry is picked with ``random.choice``.
    """
    urna = [
        termine
        for termine, quante in h.items()
        for _ in range(quante)
    ]
    return random.choice(urna)
def somma_cumulata(istogramma):
    """Return the running totals of a histogram's counts.

    Args:
        istogramma: Dict mapping word -> count.

    Returns:
        A list with one entry per dict value, in the dict's iteration order,
        where entry ``i`` is the sum of the first ``i + 1`` counts. An empty
        histogram gives an empty list.
    """
    # Imported here so the block brings its own dependency into scope.
    from itertools import accumulate

    return list(accumulate(istogramma.values()))
# The keys of isto copied into a list, so that words can be addressed by
# position.
lista_parole = list(isto.keys())
def parola_casuale(istogramma):
    """Draw a random word from a histogram, weighted by its count.

    A number between 1 and the total count is drawn, and the word returned is
    the first one whose running total reaches that number.

    Args:
        istogramma: Dict mapping word -> count.

    Returns:
        One key of ``istogramma``.

    Raises:
        IndexError: If ``istogramma`` is empty.
    """
    # Imported here so the block brings its own dependency into scope.
    from bisect import bisect_left

    lista_freq = somma_cumulata(istogramma)
    if not lista_freq:
        raise IndexError('cannot choose a word from an empty histogram')
    numero = random.randint(1, lista_freq[-1])
    # The first running total >= numero identifies the word. Testing for
    # equality, as before, only matched draws that landed exactly on a total
    # and ran past the end of the list for every other draw.
    indice = bisect_left(lista_freq, numero)
    # Index the keys of the histogram that was passed in, not a module-level
    # list built from a different dict.
    return list(istogramma)[indice]