-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanup.py
140 lines (119 loc) · 3.96 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import nltk
import sys
import re
reload(sys)
sys.setdefaultencoding('utf-8')
import pprint
reg = re.compile(r'[A-Z]+')
dance = open("DanceDomainText.txt")
arpabet = nltk.corpus.cmudict.dict()
weightDict = {}
pronDict = {}
countDict = {}
wordDict = {}
wordWeightDict = {}
diphoneDict = {}
sentenceWeight = {}
words = []
lowered = []
barephonelist = []
englishPhones = []
nums = ['0', '1', '2']
#convert text to sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data=dance.read()
sentenceList = tokenizer.tokenize(data)
#convert sentences to words and add them to wordDict, with a freq count for each word in the overall text
def getWordsFromText(sentenceList):
for sentence in sentenceList:
words = nltk.word_tokenize(sentence)
for word in words:
if word in wordDict:
wordDict[word] += 1
else:
wordDict[word] = 1
# convert words to lowercase
def unCapitalize(wordDict):
for word in wordDict:
low_word = word.lower()
lowered.append(low_word)
# get the pronunciations of words from cmudict and format them in pronDict as word: pronunciation
def createProns(pronDict):
for word in lowered:
if word in pronDict:
pass
else:
if word in arpabet:
pronDict[word] = arpabet[word][0]
else:
pass
# strip the phones of their u' tags and tone numbers, put them in their own list as well as updating the pronDict
def stripPhones(pronDict):
for word in pronDict:
newList = []
for phone in pronDict[word]:
if len(phone) < 3:
phone = phone.lower()
phone = str(phone)
barephonelist.append(phone)
newList.append(phone)
elif phone[2] in nums:
phone = phone.lower()
phone = str(phone)
phone = phone[0:2]
barephonelist.append(phone)
newList.append(phone)
pronDict[word] = newList
# give frequency weights to the bare phonemes as they appear in the text
def getPhoneWeights(barephonelist):
for phoneme in barephonelist:
if phoneme in weightDict:
weightDict[phoneme] += 1
else:
weightDict[phoneme] = 1
#create diphones -- this remains unused
def getEnglishPhones(weightDict):
for phone in weightDict.keys():
englishPhones.append(phone)
def getDiphones(englishPhones):
for phone in englishPhones:
for phone2 in englishPhones:
diphone = (phone, phone2)
if diphone in diphoneDict:
diphoneDict[diphone] += 1
else:
diphoneDict[diphone] = 1
# give each word a weight which is the sum of all phonemes' weights within that word
def getWordWeights(pronDict):
for word in pronDict:
weightsum = 0
for phone in pronDict[word]:
weightsum += weightDict[phone]
wordWeightDict[word] = weightsum
# give each sentence a final weight. The weight is a sum of its words' weights, divided by the length of the sentence.
# Pass if the word is not in nltk's dictionary.
def getSentenceWeight(sentenceList):
for sentence in sentenceList:
words = nltk.word_tokenize(sentence)
sentsum = 0
for word in words:
try:
small = word.lower()
sentsum += wordWeightDict[small]
except KeyError:
pass
sentsum /= len(sentence)
sentenceWeight[sentence] = sentsum
getWordsFromText(sentenceList)
unCapitalize(wordDict)
createProns(pronDict)
stripPhones(pronDict)
getPhoneWeights(barephonelist)
getEnglishPhones(weightDict)
getDiphones(englishPhones)
getWordWeights(pronDict)
getSentenceWeight(sentenceList)
with open('DanceDomainSentences.txt', 'w') as f:
sortedsw = sorted(sentenceWeight.items(), key=lambda x: x[1], reverse=True)
for i in sortedsw:
f.write(str(i) + "\n")