Skip to content

lib #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions L0_DD_Full_Runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import os
import time
import subprocess
import sys
import logging

print('\nOmniscien Domain Detector - Launcher')
print('============================\n')

# Arguments:
#   libPATH                 - folder holding all Domain Detection scripts
#   InputDir                - folder with the data used for Language Model training
#   OutputDir               - folder that receives the trained LM
#   langID                  - language id used for domain detection
#   nGram                   - n-gram order for LM training
#   moses                   - path to mosesdecoder
#   kenLM                   - path to kenLM
#   log_path                - folder for all system logs
#   P2_Path_to_Input_files  - folder with files to check for in-domain content
#   P2_Path_to_output_files - folder that receives the in-domain results
#   Deactivate_LM_Training  - 1 = run everything including LM training,
#                             anything else = scoring + extraction only

try:
    libPATH = sys.argv[1]
    InputDir = sys.argv[2]
    OutputDir = sys.argv[3]
    langID = sys.argv[4]
    nGram = sys.argv[5]
    moses = sys.argv[6]
    kenLM = sys.argv[7]
    log_path = sys.argv[8]
    P2_Path_to_Input_files = sys.argv[9]
    P2_Path_to_output_files = sys.argv[10]
    Deactivate_LM_Training = int(sys.argv[11])
except (IndexError, ValueError):  # was a bare except: missing arg or non-int flag
    print('Not the right format--- \n'
          'usage: L0_DD_Full_Runner.py {libPATH} {Input_Folder} {Output_Folder} {langID} {nGram} {moses_path} {KenLm_path} {log_path} {Input_Indomain_test_file} {Output_Indomain_test_file} {Deactivate LM Training 0/1 }')
    sys.exit(1)

# Logger settings: dedicated file logger for this launcher.
# os.path.join tolerates log_path with or without a trailing slash.
log_path_L0 = os.path.join(log_path, 'L0_DD_Log_Full_Runner.txt')
logger = logging.getLogger('L0_DD_Log_Full_Runner.txt')
hdlr = logging.FileHandler(log_path_L0)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Binary LM produced by P1 and consumed by P2.
P2_model_file = os.path.join(OutputDir, langID.upper() + "_Lm.bin")


def _score_and_extract_all():
    """Run P2 (LM scoring) then P3 (in-domain extraction) for every input file.

    Factored out because the original script duplicated this loop verbatim in
    both branches of the mode switch below.
    """
    for item_tmp in os.listdir(P2_Path_to_Input_files):
        logger.info("Start :\t" + str(item_tmp))
        print('processing item :', item_tmp)
        P2_output_file = item_tmp.replace('txt', 'scoring.xml')
        # NOTE(review): shell=True with concatenated paths breaks on spaces and
        # is shell-injection prone; kept for compatibility with the P1-P3 scripts.
        p2_cmd = ("python3.6 " + str(libPATH) + "P2_DD_LMScoring.py" + " " +
                  str(P2_Path_to_Input_files) + str(item_tmp) + " " +
                  str(P2_Path_to_output_files) + str(P2_output_file) + " " +
                  str(P2_model_file) + " " + str(moses) + " " + str(langID) + " " + str(log_path))
        print(p2_cmd)
        subprocess.call(p2_cmd, shell=True)
        P3_input_file = str(P2_Path_to_Input_files) + str(item_tmp)
        P3_xml_file = str(P2_Path_to_output_files) + str(P2_output_file)
        P3_outputscore_file = str(P2_Path_to_output_files) + P2_output_file.replace('scoring.xml', 'extracted-score.txt')
        P3_outputsentences_file = str(P2_Path_to_output_files) + P2_output_file.replace('scoring.xml', 'InDomain-Sentences.txt')
        subprocess.call("python3.6 " + str(libPATH) + "P3_DD_Extract.py" + " " +
                        str(P3_input_file) + " " + str(P3_xml_file) + " " +
                        str(P3_outputscore_file) + " " + str(P3_outputsentences_file) + " " +
                        str(log_path), shell=True)
        logger.info("End :\t" + str(item_tmp))


if Deactivate_LM_Training == 1:  # full mode (flag name is misleading: 1 = run everything)
    print("Running All steps")
    proc1 = ("python3.6 " + str(libPATH) + "P1_DD_TokandLM.py" + " " + str(InputDir) + " " +
             str(OutputDir) + " " + str(langID) + " " + str(nGram) + " " + str(moses) + " " +
             str(kenLM) + " " + str(log_path))
    subprocess.call(proc1, shell=True)
    print(proc1)
    print("*********")
    _score_and_extract_all()
else:  # scoring and extraction only, against a previously trained LM
    print("Running Step 2 & 3")
    _score_and_extract_all()

# Move the tokenised copies P2 leaves next to the inputs into the output folder.
# NOTE(review): the original diff's indentation for this line is ambiguous;
# placed at module level since P2 creates *.Tok.txt in both modes - confirm.
subprocess.call("mv " + str(P2_Path_to_Input_files) + "*.Tok.txt " + str(P2_Path_to_output_files), shell=True)
70 changes: 70 additions & 0 deletions P1_DD_TokandLM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import time
import subprocess
import sys
import logging

print('Omniscien Domain Detector P1')
print('==========================')

# Arguments:
#   InputDir  - folder with the data used for Language Model training
#   OutputDir - folder that receives the trained LM
#   langID    - language id used for domain detection
#   nGram     - n-gram order for LM training
#   moses     - path to mosesdecoder
#   kenLM     - path to kenLM
#   log_path  - folder for all system logs

try:
    InputDir = sys.argv[1]
    OutputDir = sys.argv[2]
    langID = sys.argv[3]
    nGram = sys.argv[4]
    moses = sys.argv[5]
    kenLM = sys.argv[6]
    log_path = sys.argv[7]
except IndexError:  # was a bare except: only a missing argument can fail here
    print('Not the right syntax--- \n'
          'usage: P1_DD_TokandLM.py {Input_Folder} {Output_Folder} {langID} {nGram} {moses} {KenLm} {log_path}')
    sys.exit(1)

# Logger settings: dedicated file logger for the tokenise + train step.
# os.path.join tolerates log_path with or without a trailing slash.
log_path = os.path.join(log_path, 'P1_DD_Log_TOKandML.txt')
logger = logging.getLogger('P1_DD_Log_TOKandML.txt')
hdlr = logging.FileHandler(log_path)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Tokenise every input file with the Moses tokenizer and stage it for LM training.
# This runs at import time, before main(), exactly as in the original script.
for item_tmp in os.listdir(InputDir):
    logger.info("Start :\t" + str(item_tmp))
    print('processing item :', item_tmp)
    subprocess.call("cp " + str(InputDir) + str(item_tmp) + " " + str(OutputDir), shell=True)
    subprocess.call(str(moses) + "scripts/tokenizer/tokenizer.perl -l " + str(langID) +
                    " -threads 4 <" + str(OutputDir) + str(item_tmp) + "> " +
                    str(OutputDir) + str(item_tmp).replace('.txt', str(langID.upper()) + '.Tok.txt'),
                    shell=True)
    # Drop the untokenised staging copy ("-f" suffices: these are plain files, not trees).
    subprocess.call("rm -f " + str(OutputDir) + str(item_tmp), shell=True)
    logger.info("End :\t" + str(item_tmp))


def main():
    """Concatenate the tokenised files, then train and binarise the KenLM model."""
    lang = str(langID.upper())
    subprocess.call("cat " + str(OutputDir) + "*" + lang + ".Tok.txt > " +
                    str(OutputDir) + lang + "_Lm_data.txt", shell=True)
    subprocess.call("rm -rf " + str(OutputDir) + "*" + lang + ".Tok.txt", shell=True)
    print('Prepare for Training LM========>')
    # lmplz trains the ARPA model; build_binary converts it to the fast binary format.
    # Unified on subprocess.call (the original mixed os.system and subprocess.call).
    subprocess.call(str(kenLM) + "build/bin/lmplz -o " + str(nGram) + " -S 50% --skip_symbols < " +
                    str(OutputDir) + lang + "_Lm_data.txt" + " > " + str(OutputDir) + "tmp.arpa",
                    shell=True)
    subprocess.call(str(kenLM) + "build/bin/build_binary " + str(OutputDir) + "tmp.arpa" + " " +
                    str(OutputDir) + lang + "_Lm.bin", shell=True)


# Run main process with wall-clock bookkeeping.
if __name__ == "__main__":
    start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
    print("=== Start at:\t" + str(start_time))
    logger.info("Start- Prepare LM")
    main()
    print('Done')
    logger.info("End- Prepare LM")
    end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
    print("=== End at:\t" + str(end_time))
109 changes: 109 additions & 0 deletions P2_DD_LMScoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import numpy as np
import xml.etree.ElementTree as ET
from lxml import etree
import kenlm
import sys
import time
import os
import logging
import subprocess

print('Omniscien Domain Detector P2')
print('==========================')

# Arguments:
#   InputFile  - path+name of the input file to run through the LM
#   OutputFile - path+name of the ladder XML file with per-sentence in-domain scores
#   ModelPath  - path to the binary LM
#   moses      - path to mosesdecoder
#   langID     - language id used for domain detection
#   log_path   - folder for all system logs

try:
    InputFile = sys.argv[1]
    OutputFile = sys.argv[2]
    ModelPath = sys.argv[3]
    moses = sys.argv[4]
    langID = sys.argv[5]
    log_path = sys.argv[6]
except IndexError:  # was a bare except: only a missing argument can fail here
    print('Not the right syntax--- \n'
          'usage: P2_DD_LMScoring.py {Input_File} {Output_File} {Model_Path} {moses_Path} {lang_ID} {log_path}')
    sys.exit(1)

# Logger settings: dedicated file logger for the scoring step.
# os.path.join tolerates log_path with or without a trailing slash.
log_path = os.path.join(log_path, 'P2_DD_Log_LMScoring.txt')
logger = logging.getLogger('P2_DD_Log_LMScoring.txt')
hdlr = logging.FileHandler(log_path)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Load the KenLM model once; accumulators for sentences, raw LM scores, soft scores.
print("Loading the model ======> ")  # fixed typo: was "Loding"
model = kenlm.LanguageModel(str(ModelPath))
list_Sentences = []
list_Scores = []
list_Soft_Scores_w = []
# Transformation of raw LM scores into soft (0, 1) scores.
def transform_func(x):
    """Squash a negative KenLM log10 score into the open interval (0, 1).

    Algebraically 1 / (1 + exp(log(-x))) == 1 / (1 - x) for x < 0; the direct
    form skips the exp/log round-trip, so it is exact (no rounding through
    transcendental functions) and well-defined for any x < 1, where the
    original produced nan for x >= 0 via the log of a non-positive number.
    """
    return 1.0 / (1.0 - float(x))
#Ladder File creation 2 functions
def create_xml(sentences=None, scores=None, soft_scores=None, out_path=None):
    """Write the ladder XML: a <sentences> root with one <sent>/<s_s> per sentence.

    Each <s_s> text is "index<TAB>raw_lm_score<TAB>soft_score" (1-based index).
    Arguments default to the module globals all_sentences / all_scores /
    list_Soft_Scores_w / OutputFile so existing no-argument callers behave
    exactly as before; passing them explicitly makes the function testable.

    The original built an outer element and immediately shadowed it with a
    same-named child; only the inner element ever reached the file, so a
    single root element produces identical output.
    """
    if sentences is None:
        sentences = all_sentences
    if scores is None:
        scores = all_scores
    if soft_scores is None:
        soft_scores = list_Soft_Scores_w
    if out_path is None:
        out_path = OutputFile
    root = ET.Element("sentences")
    for sent_nb in range(len(sentences)):
        sent_i = ET.SubElement(root, "sent")
        score_s = ET.SubElement(sent_i, "s_s")
        score_s.text = str(sent_nb + 1) + "\t" + str(scores[sent_nb]) + "\t" + str(soft_scores[sent_nb])
    tree = ET.ElementTree(root)
    tree.write(str(out_path), encoding='utf-8', xml_declaration=True)

def prettyPrintXml(xmlFilePathToPrettyPrint):
    """Re-parse the XML file at the given path and rewrite it in place, pretty-printed.

    Entity resolution is disabled (defensive against XXE) and CDATA sections
    are kept as-is.
    """
    # NOTE(review): assert is stripped under -O; an explicit raise would be safer.
    assert xmlFilePathToPrettyPrint is not None
    xml_parser = etree.XMLParser(resolve_entities=False, strip_cdata=False)
    doc = etree.parse(xmlFilePathToPrettyPrint, xml_parser)
    doc.write(xmlFilePathToPrettyPrint, pretty_print=True, encoding='utf-8')

# Prepare the new input file: tokenise it with the Moses tokenizer; the
# tokenised copy is written next to the input as <name><LANG>.Tok.txt.
tok_output = str(InputFile).replace('.txt', str(langID.upper()) + '.Tok.txt')
tok_cmd = (str(moses) + "scripts/tokenizer/tokenizer.perl -l " + str(langID) +
           " -threads 4 <" + str(InputFile) + "> " + tok_output)
subprocess.call(tok_cmd, shell=True)
print("moses " + tok_cmd)
InputFile2 = tok_output

# Single pass over the tokenised file: collect each sentence (newline stripped)
# and its raw LM score. The original read the whole file twice for this, and
# its second pass logged "Start- Scoring Input" as both start and end (fixed).
logger.info("Start- Prepare Input")
with open(str(InputFile2), mode="r", encoding='utf-8') as tok_file:
    for line in tok_file:
        list_Sentences.append(line.replace('\n', ''))
        # score() sees the raw line (incl. trailing newline), as before.
        list_Scores.append(float(model.score(str(line))))
logger.info("End- Prepare Input")

all_sentences = list_Sentences
all_scores = list_Scores
# Map every raw log score to a soft (0, 10) score for the ladder file.
logger.info("Start- Transform Scores")
for w in all_scores:
    list_Soft_Scores_w.append(transform_func(float(w)) * 10)
logger.info("End- Transform Score")

# Main process: emit the ladder XML and pretty-print it, with wall-clock bookkeeping.
if __name__ == "__main__":
    started_at = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
    print("=== Start at:\t" + str(started_at))
    logger.info("Start- Prepare XML")
    print("LSS-w", list_Soft_Scores_w)
    # Write the ladder file, then rewrite it in place with indentation.
    create_xml()
    prettyPrintXml(str(OutputFile))
    print("Ladder File IsReady !")
    logger.info("End- Prepare XML")

    finished_at = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
    print("=== End at:\t" + str(finished_at))
68 changes: 68 additions & 0 deletions P3_DD_Extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import time
import subprocess
import sys
import xml.etree.ElementTree as ET
import logging

print('\nOmniscien Domain Detector P3')
print('============================\n')

# Arguments:
#   Input_File    - path+name of the input file that was run through the LM
#   LadderXMLfile - path+name of the scoring .xml produced by P2
#   OutputScore   - path+name of the output file for the selected index/score pairs
#   OutputFile    - path+name of the output file for the selected sentences
#   log_path      - folder for all system logs
#   threshold     - optional soft-score cut-off (default 0.5)

try:
    Input_File = sys.argv[1]
    LadderXMLfile = sys.argv[2]
    OutputScore = sys.argv[3]
    OutputFile = sys.argv[4]
    log_path = sys.argv[5]
except IndexError:  # was a bare except: only a missing argument can fail here
    print('Not the right format--- \n'
          'usage: P3_DD_Extract.py {Input_File} {LadderXMLfile} {OutputScore} {Output_File} {log_path} [threshold --default 0.5]')
    sys.exit(1)

# Optional sixth argument; converted to float once here instead of at every use.
try:
    threshold = float(sys.argv[6])
except IndexError:
    threshold = 0.5

# Logger settings: dedicated file logger for the extraction step.
# os.path.join tolerates log_path with or without a trailing slash.
log_path = os.path.join(log_path, 'P3_DD_Log_Extract.txt')
logger = logging.getLogger('P3_DD_Log_Extract.txt')  # fixed: was the P1 logger name (copy-paste)
hdlr = logging.FileHandler(log_path)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Extractor: read the ladder XML written by P2.
print("Selected Threshold: \t", float(threshold))
logger.info("Start -Extract")
root = ET.parse(str(LadderXMLfile)).getroot()
all_items = root.findall("sent")

# Select every item whose soft score clears the threshold. The score file is
# opened once (the original reopened it in append mode for every match), and
# the "index<TAB>raw<TAB>soft" payload is split once per item instead of three times.
i = 0
list_Index = []
with open(str(OutputScore), mode="a", encoding='utf-8') as extract_result:
    for item in all_items:
        fields = item.find('s_s').text.strip().split("\t")  # [index, raw_score, soft_score]
        if float(fields[2]) >= threshold:
            print(fields[0] + '\t' + fields[2], file=extract_result)
            i += 1
            list_Index.append(fields[0])

# Copy the matching original sentences; ladder indices are 1-based line numbers.
with open(str(Input_File), mode='r', encoding='utf-8') as original_data, \
     open(str(OutputFile), mode='a', encoding='utf-8') as output_sentences:
    x = original_data.readlines()
    for y in list_Index:
        print(x[int(y) - 1].replace('\n', ''), file=output_sentences)

print(str(i) + " Items selected from " + str(len(all_items)))
logger.info("End -Extract")