Skip to content

Commit

Permalink
changing paragraph text API and adding or for tags
Browse files Browse the repository at this point in the history
  • Loading branch information
amirekhlasi committed Jan 3, 2019
1 parent fb967a5 commit 7d5b63f
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 45 deletions.
50 changes: 27 additions & 23 deletions API.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_bookshelves(bookshelves_list=None):
return bookshelves


def get_paragraphs(paragraph_id=None, books=None, tags=None, num_sequential=1, Paragraph_Object=True):
def get_paragraphs(paragraph_id=None, books=None, tags=None, num_sequential=1, paragraph_object=True, lowercase=False):
"""
Get paragraphs from args.
Expand All @@ -74,7 +74,8 @@ def get_paragraphs(paragraph_id=None, books=None, tags=None, num_sequential=1, P
list or ... it means the tag should be at least one of those tags. for instance tags = [3, {4, 5}]
means that paragraphs with tag 3 and 4 or 5
num_sequential: the number of sequential paragraphs
Paragraph_Object: if it is True outputs will be type of Paragraph
paragraph_object: if it is True outputs will be type of Paragraph
lowercase: if it is True, then the output will be lowercase. it does not have effect if paragraph_object=True.
Returns:
a list of paragraphs or list of tuples of paragraphs if num_sequential > 1
Expand All @@ -97,27 +98,30 @@ def get_paragraphs(paragraph_id=None, books=None, tags=None, num_sequential=1, P
pars = {i: par for i, par in pars.items() if all([not par.tags.isdisjoint(tag) for tag in tags])}

if num_sequential == 1:
if Paragraph_Object:
if paragraph_object:
return list(pars.values())
else:
return [par.sentences for par in pars.values()]
assert num_sequential > 1
pars2 = []
for par in pars.values():
pp = [par]
next_par = par
flag = True
for k in range(1, num_sequential):
cur_par = next_par
id = cur_par.next_id
if id not in pars:
flag = False
break
next_par = pars[id]
pp.append(next_par)
if flag:
pars2.append(tuple(pp))
if Paragraph_Object:
return pars2
return [par.text(lowercase=lowercase) for par in pars.values()]
elif num_sequential > 1:
pars2 = []
for par in pars.values():
pp = [par]
next_par = par
flag = True
for k in range(1, num_sequential):
cur_par = next_par
id = cur_par.next_id
if id not in pars:
flag = False
break
next_par = pars[id]
pp.append(next_par)
if flag:
pars2.append(tuple(pp))
if paragraph_object:
return pars2
else:
return [tuple(par.text(lowercase=lowercase) for par in pt) for pt in pars2]
else:
return [tuple(par.sentences for par in pt) for pt in pars2]
raise ValueError("num_sequential most be positive")

20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ pars = API.get_paragraphs(books=books, tags=tags, num_sequential=n)
```

* If books is not None, then the returned paragraphs will only belong to that list. books is a list of integers \(books id\) or GutenbergBook.
* If tags are specified then only the paragraphs which include those tags will be returned. It is recommended to specify that parameter because there are a lot of paragraphs that are not litteraly paragraphs
\( like dialogue, titles, and ...\). tags are integer numbers and class Tags in HP specify identify each number. For instance HP.Tags.WITHOUT_DIALOGUE (which is 4) identify the paragraphs without any dialogue.
* If tags are specified, then only the paragraphs which include those tags will be returned. It is recommended to specify this parameter because there are a lot of paragraphs that are not literally paragraphs \(like dialogue, titles, and so on\). Tags are integers, and the class Tags in HP identifies each number. For instance, HP.Tags.WITHOUT_DIALOGUE (which is 4) identifies the paragraphs without any dialogue.
The format of tags is a list whose items are integers or lists. An inner list is dedicated to "or", which means at least one of its tags should exist in the paragraph's tags. For instance, tags=\[1, 2, \[3, 4\]\] means that the output paragraphs have tags 1 and 2 and at least one of 3 and 4.
* If num_sequential > 1, the output will be a list of tuples \(pairs, triples, ...\) of sequential paragraphs, each of which has the properties given to the method. \(Being sequential means the paragraphs are consecutive in a book.\)

Note that you can get paragraphs by paragraph id:
Expand All @@ -36,16 +36,20 @@ Note that you can get paragraphs by paragraph id:
pars = API.get_paragraphs(paragraph_id=ids)
```

if you have memory limit, you can use Paragraph_Object = False to return only paragrap sentences instead of Paragraph Object.
If you have a memory limit, you can use paragraph_object=False to return only the paragraph sentences instead of Paragraph objects.

```python
pars = API.get_paragraphs(Paragraph_Object=False)
pars = API.get_paragraphs(paragraph_object=False, lowercase=<True/False>)
```

### Paragraph
Paragraph is a class for working with paragraph easily. it contains, some id (id, next_id, prev_id, book_id) if they exist, tags and text. in order to get text you can use 3 different properties.
* Paragraph.sentences return a list of list. each list is the list of words of a sentence.
* Paragrap.words returns the words of a paragraph.
* Paragraph.text returns the text as a str.
Paragraph is a class for working with paragraphs easily. It contains some ids (id, next_id, prev_id, book_id) if they exist, tags, and text. To get the text, call the text() method of the class:
```python
text = par.text(format=<"sentences"/"words"/"text">, lowercase=<True/False>)
```

* format = "sentences" returns a list of sentences, where each sentence is a list of words.
* format = "words" returns a list of words.
* format = "text" returns a string.


51 changes: 37 additions & 14 deletions paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ def __init__(self, text, id=None, book_id=None, next_id=None, prev_id=None, tags
def id(self):
return self._id

@property
def text(self):
text = sum(self._text, [])
return " ".join(text)

@property
def has_book(self):
return self._book_id is not None
Expand Down Expand Up @@ -78,13 +73,41 @@ def add_tag(self, new_tags):
def metadata(self):
return paragraph_metadata(self.id, self.book_id, self.prev_id, self.next_id, self.tags)

@property
def sentences(self):
return [[word for word in sent] for sent in self._text]

@property
def words(self):
return sum(self._text, [])
def text(self, format="sentences", lowercase=False):
    """
    Return the text of the paragraph in the requested format.

    Args:
        format: one of "sentences", "words" or "text".
            "sentences": a list of lists, each inner list holding the tokens of one sentence.
            "words": a flat list of all tokens of the paragraph.
            "text": a single string, the tokens joined by single spaces.
        lowercase: a boolean. If True, the returned tokens / string are lowercased.
    Returns:
        Depending on format: a list of lists of strings, a list of strings, or a string.
    Raises:
        ValueError: if format is not one of the supported values.
    """
    if format == "sentences":
        if lowercase:
            return [[word.lower() for word in sent] for sent in self._text]
        # Copy each sentence so callers cannot mutate the stored tokens.
        return [list(sent) for sent in self._text]

    if format not in ("words", "text"):
        raise ValueError('format should be one of ["sentences", "words", "text"]')

    # Flatten in a single linear pass (sum(list_of_lists, []) is quadratic).
    words = [word for sent in self._text for word in sent]
    if format == "words":
        if lowercase:
            return [word.lower() for word in words]
        return words

    # format == "text"
    joined = " ".join(words)
    # str has no .lowercase() method; .lower() is the correct call.
    return joined.lower() if lowercase else joined


def create_paragraphs(paragraph_metadata, paragraph_text):
Expand All @@ -102,13 +125,13 @@ def create_paragraphs(paragraph_metadata, paragraph_text):
return dict(pars)


def paragraph_metadata(id=None, book_id=None, prev_id=None, next_id=None,tags=None):
def paragraph_metadata(id=None, book_id=None, prev_id=None, next_id=None, tags=None):
"""
A helper for creating metadata
Args:
id, book_id, prev_id, next_id, are integers. tags is integer or list, set or ... of integers.
id, book_id, prev_id, next_id, are integers. tags is either integer or list, set and ... of integers.
if one of them is None, it will be ignored in return
Returns:
Expand Down

0 comments on commit 7d5b63f

Please sign in to comment.