Python project setup (#29)
* adding data files

* baseline NER

* old jupyter files

* New changes for CRF and Negation detection (#26)

* Create HEYDAY_WEEK3.pdf

Upload capstone week 3 process report.

* Added the CRF and BERT-negation

* Changes to crf

* Added crf

* adding data files (#22)

* adding rules (#24)

* Added the semi supervised and BERT negation and combined

Co-authored-by: StarrySkyr <starryskyr@icloud.com>
Co-authored-by: varadp02 <poojari.varadraj@gmail.com>
Co-authored-by: Andrew Stich <81941500+andrewkwstich@users.noreply.github.com>

* Converting to python files

* Rule based negation

* REST API to extract product and attributes

* project restructure

* Update README.md

* Spell check control

* Update README.md

* Update README.md

* Upload apis to add knowledge base for product catalogue

* sample requests

* evaluation boilerplate

Co-authored-by: StarrySkyr <starryskyr@icloud.com>
Co-authored-by: varadp02 <poojari.varadraj@gmail.com>
Co-authored-by: Andrew Stich <81941500+andrewkwstich@users.noreply.github.com>
4 people authored Jun 15, 2022
1 parent ef1262c commit a2a7d86
Showing 43 changed files with 102,983 additions and 2 deletions.
63 changes: 61 additions & 2 deletions README.md
@@ -1,4 +1,63 @@
# NegativeAwareNER
Capstone Project with HeyDay (May-June 2022)

## Developing a negation-aware named entity recognition system in the product search context

## Steps to set up the project

- Run `conda env update --file env.yaml --prune` to install all required dependencies
- Create the training set for the NER model with `python code/TrainSet_Generator.py`
- Run the API server with `uvicorn main:app --reload --app-dir=code/`
- Create the knowledge base of product names by uploading a file to the `http://127.0.0.1:8000/uploadfile/` endpoint, as sketched below
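
A minimal upload sketch (the multipart field name `file` is an assumption, matching FastAPI's default for an `UploadFile` parameter):

```
curl -X 'POST' \
  'http://127.0.0.1:8000/uploadfile/' \
  -H 'accept: application/json' \
  -F 'file=@data/products.csv'
```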

_REQUEST_:

```
curl -X 'POST' \
  'http://127.0.0.1:8000/extractInformation/' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "message": "I want a blackk dress with no pink ribbons",
    "language": "English",
    "correct_spellings": true,
    "negationStyle": "tags"
  }'
```
_RESPONSE_:
```
{
  "tokens": [
    "I",
    "want",
    "a",
    "black",
    "dress",
    "with",
    "no",
    "pink",
    "ribbons"
  ],
  "tags": [
    "O",
    "O",
    "O",
    "I-COLOUR",
    "I-ATTRIBUTE",
    "O",
    "O",
    "I-ATTRIBUTE",
    "I-ATTRIBUTE"
  ],
  "isNegative": [
    false,
    false,
    false,
    false,
    false,
    false,
    false,
    true,
    true
  ]
}
```
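
The same request can be issued from Python (a minimal sketch, assuming the server is running locally and the third-party `requests` package is installed):

```
import requests

payload = {
    "message": "I want a blackk dress with no pink ribbons",
    "language": "English",
    "correct_spellings": True,
    "negationStyle": "tags",
}

response = requests.post("http://127.0.0.1:8000/extractInformation/", json=payload)
print(response.json())  # tokens, tags, and isNegative arrays, as shown above
```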
293 changes: 293 additions & 0 deletions code/NER_Model.py
@@ -0,0 +1,293 @@

import csv

import nltk
import pandas as pd
import spacy
import contextualSpellCheck
import sklearn_crfsuite
from sklearn.metrics import f1_score, classification_report

# nltk.pos_tag requires the tagger data, e.g. nltk.download('averaged_perceptron_tagger')

# CRF sequence tagger: L-BFGS optimisation with L1 (c1) and L2 (c2) regularisation
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=350,
    all_possible_transitions=True,
    verbose=True,
)


# ### Creating features using knowledge bases

# colors.csv has no header row, so pandas treats the first colour name
# ("Air Force Blue (Raf)") as the column label.
colors = pd.read_csv("data/colors.csv")

color_list = []
for index, row in colors.iterrows():
    color_list.append(row['Air Force Blue (Raf)'])

products = pd.read_csv('data/products.csv', sep='\t')

# Collect every word from the product names, naively singularising
# plurals by stripping a trailing "s".
product_list = []
for index, row in products.iterrows():
    for i in row[0].split():
        if i[-1] == 's':
            product_list.append(i[:-1])
        else:
            product_list.append(i)

# Deduplicate the product vocabulary.
product_list = list(set(product_list))


# ### Feature Engineering


def get_pos(word):
    """Return the NLTK part-of-speech tag for a single word."""
    tag = nltk.pos_tag([word])
    return tag[0][1]


def is_number(string):
    return any(char.isdigit() for char in string)


def word2features(sentence, idx):
    """Build the CRF feature dictionary for the token at position idx."""
    word_features = {}
    word_features['word_lowercase'] = sentence[idx].lower()

    ## Features looking at the neighbouring words:
    if idx > 0:
        word_features["pre_word"] = sentence[idx - 1].lower()
    else:
        word_features["pre_word"] = ""
    if idx < len(sentence) - 1:
        word_features["next_word"] = sentence[idx + 1].lower()
    else:
        word_features["next_word"] = ""

    if idx > 1:
        word_features["pre2_word"] = sentence[idx - 2].lower()
    else:
        word_features["pre2_word"] = ""

    if idx < len(sentence) - 2:
        word_features["next2_word"] = sentence[idx + 2].lower()
    else:
        word_features["next2_word"] = ""

    ## Features looking at the word endings:
    if len(sentence[idx]) > 2:
        word_features["last2char"] = sentence[idx][-2:]
    else:
        word_features["last2char"] = sentence[idx]

    if len(sentence[idx]) > 3:
        word_features["last3char"] = sentence[idx][-3:]
    else:
        word_features["last3char"] = sentence[idx]

    ## Features considering the shape of the word:
    word_features["upper"] = sentence[idx].isupper()
    word_features["lower"] = sentence[idx].islower()
    word_features["length"] = len(sentence[idx])
    word_features["position"] = idx

    ## Extra features:

    ## Is number
    word_features["number"] = is_number(sentence[idx])

    ## Is noun
    word_features["is_noun"] = get_pos(sentence[idx]) == "NN"

    # Colour feature (knowledge-base lookup):
    word_features["color"] = sentence[idx].lower() in color_list

    # Product feature (knowledge-base lookup):
    word_features["product"] = sentence[idx].lower() in product_list

    # Title-case feature: mid-sentence title case is informative; for the
    # first word the feature is forced to False when it is title-cased
    # (and omitted otherwise), so sentence-initial capitalisation is not
    # mistaken for a proper noun.
    if idx != 0:
        word_features['first_word_not_in_title_case'] = sentence[idx].istitle()
    elif sentence[idx].istitle():
        word_features['first_word_not_in_title_case'] = False

    return word_features


def sentence2features(sentence):
    return [word2features(sentence, idx) for idx in range(len(sentence))]
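
# Illustrative example (a sketch; the flag values depend on the loaded
# knowledge bases): sentence2features(["no", "pink", "ribbons"]) returns one
# feature dict per token, e.g. the dict for "pink" includes
# {'word_lowercase': 'pink', 'pre_word': 'no', 'next_word': 'ribbons',
#  'position': 1, ...} plus the shape and knowledge-base flags above.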

def main():

    data = pd.read_csv('data/Heyday_trainingdata.csv')

    # Rebuild full sentences and tag sequences from the token-per-row layout.
    data['Sentence'] = data[['Sentence_Index', 'Tokens', 'Tags']].groupby(['Sentence_Index'])['Tokens'].transform(lambda x: ' '.join(x))
    data['Tags'] = data[['Sentence_Index', 'Tokens', 'Tags']].groupby(['Sentence_Index'])['Tags'].transform(lambda x: ','.join(x))

    data = data[['Sentence', 'Tags']]
    data = data.drop_duplicates().reset_index(drop=True)

    # ### Loading the training and validation set

    train_dataset = data
    test_dataset = pd.read_csv('data/un-negated_clean_data.csv')
    test_dataset = test_dataset[["Tokens", "Tags", "sentence"]]
    # Collapse the BIO scheme to IO so the tags match the training data.
    test_dataset = test_dataset.replace('B-', 'I-', regex=True)
    test_dataset['Sentence'] = test_dataset[['sentence', 'Tokens', 'Tags']].groupby(['sentence'])['Tokens'].transform(lambda x: ' '.join(x))
    test_dataset['Tags'] = test_dataset[['sentence', 'Tokens', 'Tags']].groupby(['sentence'])['Tags'].transform(lambda x: ','.join(x))
    test_dataset = test_dataset[['Sentence', 'Tags']]
    test_dataset = test_dataset.drop_duplicates().reset_index(drop=True)


    def strsplit_tags(tags):
        return tags.split(",")

    def strsplit_sentence(sentence):
        return sentence.split(" ")

    train_sents = []
    for index, row in train_dataset.iterrows():
        train_sents.append((strsplit_sentence(row['Sentence']), strsplit_tags(row['Tags'])))

    dev_sents = []
    for index, row in test_dataset.iterrows():
        dev_sents.append((strsplit_sentence(row['Sentence']), strsplit_tags(row['Tags'])))

    def prepare_ner_feature_dicts(sents):
        '''sents is a list of (tokens, tags) pairs. Returns feature dictionaries
        and IOB tags for each token in the entire dataset.'''
        all_dicts = []
        all_tags = []
        for tokens, tags in sents:
            all_dicts.append(sentence2features(tokens))
            all_tags.append(tags)
        return all_dicts, all_tags

    train_dicts, train_tags = prepare_ner_feature_dicts(train_sents)
    dev_dicts, dev_tags = prepare_ner_feature_dicts(dev_sents)

    # ### CRF Model

    crf.fit(train_dicts, train_tags)

    # ### Results on the validation set

    def flatten(l):
        result = []
        for sub in l:
            result.extend(sub)
        return result

    y_pred = crf.predict(dev_dicts)
    print(f1_score(flatten(dev_tags), flatten(y_pred), average='macro'))
    print(f1_score(flatten(dev_tags), flatten(y_pred), average='micro'))
    print(classification_report(flatten(dev_tags), flatten(y_pred)))


    # ### Tagging the validation set

    val_set = pd.read_csv('data/un-negated_clean_data.csv')

    token_list = val_set.Tokens.values.tolist()
    sentence_list = val_set.sentence.values.tolist()

    # Write the predicted tags alongside the original tokens.
    rows = zip(sentence_list, token_list, flatten(y_pred))
    with open('data/Heyday_validationdata.csv', "w") as f:
        writer = csv.writer(f)
        writer.writerow(('Sentence_Index', 'Tokens', 'Predicted_NER_Tags'))
        for row in rows:
            writer.writerow(row)

def predict(sentence, correct_spellings=True):
    """Tokenise a sentence (optionally spell-corrected) and tag it with the trained CRF."""
    nlp = spacy.load('en_core_web_sm')
    contextualSpellCheck.add_to_pipe(nlp)
    doc = nlp(sentence)
    if correct_spellings:
        # Re-tokenise the corrected text produced by contextualSpellCheck.
        tokens = [token.text for token in nlp(doc._.outcome_spellCheck)]
    else:
        tokens = [token.text for token in doc]

    print(tokens)

    feat = sentence2features(tokens)
    output_str = crf.predict([feat])
    print(output_str)
    result = []
    for token, tag in zip(tokens, output_str[0]):
        result.append((token, tag))
    return result
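
# Illustrative usage (a sketch; actual tags depend on the trained model, cf.
# the README example): predict("I want a blackk dress with no pink ribbons")
# -> [('I', 'O'), ('want', 'O'), ..., ('pink', 'I-ATTRIBUTE'), ('ribbons', 'I-ATTRIBUTE')]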


if __name__ == "__main__":
    main()