-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwords.py
145 lines (137 loc) · 4.53 KB
/
words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import sys
import re
# /u1/junk/cs617/Books
class Word:
    """A single token from a book, stored lower-cased in ``self.word``."""

    # Inclusive length bounds used by isValidLength().
    MIN_LENGTH = 5
    MAX_LENGTH = 9

    def __init__(self, word):
        """Store *word* lower-cased."""
        self.word = word.lower()

    def isDoubleLined(self):
        """Return True if the word ends with '-'.

        Filter.doubleLinedWords treats such a token as a word that is
        continued by the next token.
        """
        return self.word.endswith("-")

    def hasDashes(self):
        """Return True if the word contains a '-' anywhere."""
        return "-" in self.word

    def isValidLength(self):
        """Return True if the word length is within MIN_LENGTH..MAX_LENGTH."""
        return self.MIN_LENGTH <= len(self.word) <= self.MAX_LENGTH

    def combine(self, wordTwo):
        """Append the string *wordTwo* to this word and strip every dash.

        *wordTwo* is lower-cased first so the result stays lower-case like
        every other Word; otherwise the same word would be counted under
        different spellings depending on capitalisation.
        """
        self.word = (self.word + wordTwo.lower()).replace("-", "")
class Dictionary:
    """Tally of how many times each word has been recorded."""

    def __init__(self):
        """Start with an empty tally in ``self.dict``."""
        self.dict = {}

    def checkAndUpdate(self, word):
        """Add one to the tally for *word*; an unseen word starts at 1."""
        self.dict[word] = self.dict.get(word, 0) + 1

    def getDictionaryStats(self):
        """Return the underlying word -> count mapping (not a copy)."""
        return self.dict
class Filter:
    """Namespace of helpers that turn raw text into per-word counts."""

    @staticmethod
    def NonWords(words):
        """Split the text *words* into tokens of ASCII letters and dashes.

        Every other character acts as a separator; empty tokens produced
        by consecutive separators are dropped. Returns a list of strings.
        """
        return [token for token in re.split(r'[^A-Za-z-]', words) if token]

    @staticmethod
    def doubleLinedWords(words):
        """Count the valid-length words in the token sequence *words*.

        A token ending in '-' is joined with the token that follows it
        (which is then skipped). Any other token containing dashes is
        split on '-' and each piece is counted on its own. Only words for
        which Word.isValidLength() is true are counted.

        Returns this file's own Dictionary object holding the counts.
        """
        updatedWords = Dictionary()
        skipNext = False
        for idx, word in enumerate(words):
            if skipNext:
                # This token was already glued onto the previous one.
                skipNext = False
                continue
            currentWord = Word(word)
            # The bounds check keeps a trailing "word-" at the very end of
            # the input from indexing past the list; it falls through to
            # the dash-splitting branch instead.
            if currentWord.isDoubleLined() and idx + 1 < len(words):
                currentWord.combine(words[idx + 1])
                skipNext = True
                if currentWord.isValidLength():
                    updatedWords.checkAndUpdate(currentWord.word)
            elif currentWord.hasDashes():
                for piece in currentWord.word.split('-'):
                    splitee = Word(piece)
                    if splitee.isValidLength():
                        updatedWords.checkAndUpdate(splitee.word)
            elif currentWord.isValidLength():
                updatedWords.checkAndUpdate(currentWord.word)
        return updatedWords
class FileHandler:
    """Reads one book file and builds its word-count mapping.

    After construction:
      * ``words`` is the list of tokens read from the file, or None if
        the file could not be read.
      * ``dictionary`` is the word -> count mapping, or None if the file
        could not be read.
    """

    def __init__(self, fileName):
        """Remember *fileName* and immediately try to read it."""
        self.fileName = fileName
        self.words = None
        self.dictionary = None
        self.readFile()

    def readFile(self):
        """Open the file and process it; report an IOError on stdout.

        On failure ``words`` and ``dictionary`` keep their None values.
        """
        try:
            with open(self.fileName, "r") as fileData:
                self.words = self.organizeIntoDict(fileData)
        except IOError:
            print(self.fileName + " Not Found\n")

    def organizeIntoDict(self, data):
        """Tokenise the readable object *data* and store its word counts.

        Sets ``self.dictionary`` and returns the token list, so that the
        assignment in readFile() leaves ``self.words`` populated instead
        of None.
        """
        fileData = data.read()
        words = Filter.NonWords(fileData)
        finalDictionary = Filter.doubleLinedWords(words)
        self.dictionary = finalDictionary.getDictionaryStats()
        return words
class WordChart:
    """Cross-book table of word counts, reduced to frequently used words.

    ``wordChart`` maps each kept word to a list with one count per book,
    in the order the book dictionaries were given.
    ``totalWordCountsForAuthors`` maps each kept word to a two-item list:
    the summed counts of the books before ``authorSplitIndex`` and of the
    books from ``authorSplitIndex`` onwards.
    """

    def __init__(self, dictionaries, authorSplitIndex=4, importanceThreshold=50):
        """Store the per-book dictionaries and the tuning values.

        dictionaries: sequence of word -> count mappings, one per book.
            An entry may be None (FileHandler leaves its dictionary as
            None when a file cannot be read); it is counted as empty.
        authorSplitIndex: index of the first book of the second author.
        importanceThreshold: a word is kept only if its total count over
            all books is strictly greater than this value.
        """
        self.bookDictionaries = dictionaries
        self.authorSplitIndex = authorSplitIndex
        self.importanceThreshold = importanceThreshold
        self.wordChart = dict()
        self.totalWordCountsForAuthors = dict()
        self.wordFrequencyOfEachBook = []
        self.austinTotalImportantWords = 0
        self.dickensTotalImportantWords = 0

    def collect(self):
        """Build ``wordChart`` for every word seen in any book.

        Each word gets one count per book (0 where the book lacks it) and
        is then passed to checkIfImportant(), which may drop it again.
        """
        # Keep one slot per book even when a book is missing, so the
        # positions used for the per-author split stay aligned.
        bookCounts = [book or {} for book in self.bookDictionaries]
        allWords = set()
        for counts in bookCounts:
            allWords.update(counts)
        for word in allWords:
            self.wordChart[word] = [counts.get(word, 0) for counts in bookCounts]
            self.checkIfImportant(word)

    def checkIfImportant(self, word):
        """Keep *word* if its total count exceeds the threshold, else drop it.

        For a kept word this prints the word and its per-author totals,
        records them in ``totalWordCountsForAuthors`` and adds them to the
        running per-author totals. A dropped word is removed from
        ``wordChart``.
        """
        wordData = self.wordChart[word]
        if sum(wordData) <= self.importanceThreshold:
            self.wordChart.pop(word)
            return
        print(word)
        austinCurrentWC = sum(wordData[:self.authorSplitIndex])
        dickensCurrentWC = sum(wordData[self.authorSplitIndex:])
        self.austinTotalImportantWords += austinCurrentWC
        self.dickensTotalImportantWords += dickensCurrentWC
        self.totalWordCountsForAuthors[word] = [austinCurrentWC, dickensCurrentWC]
        print(self.totalWordCountsForAuthors[word])
class BookCollectionHandler:
    """Loads a list of book files and builds a WordChart from them."""

    def __init__(self, bookPaths):
        """Store *bookPaths* and load the word counts of every book."""
        self.bookPaths = bookPaths
        # One entry per path, in bookPaths order: the FileHandler's
        # ``dictionary`` attribute for that book (not the FileHandler).
        self.bookDictionaries = []
        self.wordCounterObjects = []
        self.getDictionaryStats()

    def getDictionaryStats(self):
        """Read every book and store its word-count dictionary.

        The list is rebuilt from scratch so that calling this again does
        not append a second copy of every book.
        """
        self.bookDictionaries = [
            FileHandler(book).dictionary for book in self.bookPaths
        ]

    def CreateWordDictionary(self):
        """Build a WordChart over the loaded books, collect it, return it.

        Returning the chart lets the caller inspect the collected counts;
        previously the chart was discarded.
        """
        chart = WordChart(self.bookDictionaries)
        chart.collect()
        # NOTE: a chart.computeProbabilities() step was sketched here, but
        # WordChart defines no such method.
        return chart
if __name__ == "__main__":
    # Order matters: WordChart.checkIfImportant sums the first four books
    # as one author and the remaining books as the other.
    austenBooks = [
        "Books/austen.em.txt",
        "Books/austen.pp.txt",
        "Books/austen.pe.txt",
        "Books/austen.ss.txt",
    ]
    dickensBooks = [
        "Books/dickens.ge.txt",
        "Books/dickens.ht.txt",
        "Books/dickens.tc.txt",
        "Books/dickens.ot.txt",
    ]
    bookPaths = austenBooks + dickensBooks
    Books = BookCollectionHandler(bookPaths)
    Books.CreateWordDictionary()