Skip to content

Commit

Permalink
saving script 정리
Browse files Browse the repository at this point in the history
  • Loading branch information
dkstlzu committed Aug 3, 2020
1 parent 6cd3e38 commit 432dd3e
Show file tree
Hide file tree
Showing 10 changed files with 181 additions and 781 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
**/Test
**/Test**
.ipynb_checkpoints
79 changes: 79 additions & 0 deletions NLP/Saving Cases in Local.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: requests in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (2.24.0)\n",
"Requirement already satisfied: idna<3,>=2.5 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (1.25.10)\n",
"Requirement already satisfied: certifi>=2017.4.17 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (2020.6.20)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (3.0.4)\n",
"Requirement already satisfied: lxml in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (4.5.2)\n"
]
}
],
"source": [
"!pip install requests\n",
"!pip install lxml"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'lxml'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-29b1022e5def>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mcases_lxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlxml_OpenAPI\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mall_in_one\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mdo\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32mD:\\DProjects\\main\\NLP\\cases_lxml\\lxml_OpenAPI.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrequests\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mrq\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mlxml\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mlxml\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0metree\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0met\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mio\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'lxml'"
]
}
],
"source": [
"import cases_lxml.lxml_OpenAPI.all_in_one as do"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"do('손해배상', 3000)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Empty file added NLP/cases_lxml/__init__.py
Empty file.
Binary file added NLP/cases_lxml/__pycache__/__init__.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file added NLP/cases_lxml/__pycache__/lxml.cpython-38.pyc
Binary file not shown.
Binary file not shown.
87 changes: 87 additions & 0 deletions NLP/cases_lxml/lxml_OpenAPI.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
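Uses the Requests module to fetch court-case lists and full case texts from the open API.
The requested case list is received as an XML document, and
the LXML module extracts only the case serial numbers (판례일련번호), which are passed to the API that returns the full case text.
From each case, only the fields used in this project are kept and saved to one txt file per case: the serial number, the case name, the holdings (판시사항), the decision summary (판결요지) and the case text (판례내용).
To train sentencepiece on the many saved txt files, they are merged into a single txt file saved as total_case.txt.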
"""



import os
import re
from io import StringIO

import lxml
import requests as rq
from lxml import etree as et

# Fetch the lxml element tree of one case, identified by its serial number.
def myfunction(case_number):
    """Fetch one court case from the law.go.kr open API and parse it.

    Args:
        case_number: Case serial number, sent as the ``ID`` query parameter.

    Returns:
        The root lxml element of the XML document returned by the service.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        lxml.etree.XMLSyntaxError: If the response body is not well-formed XML.
    """
    main = {
        'OC': 'ICTPoolC',
        'target': 'prec',
        'ID': case_number,  # serial number of the case to fetch
        'type': 'XML',
    }
    # A timeout keeps one stalled connection from hanging a long batch run.
    res_main = rq.get(
        'http://www.law.go.kr/DRF/lawService.do?',
        params=main,
        timeout=30,
    )
    res_main.raise_for_status()
    # Parse the raw bytes: lxml honours the XML encoding declaration for bytes
    # input (it rejects str input that carries one), so the declaration does
    # not have to be stripped with a regex first.
    return et.fromstring(res_main.content)

# Return the content of the wanted tag from a case's lxml element tree.
def findtext(tree, tag):
    """Return the text of the first element named *tag*, with markup removed.

    The tree is walked with ``tree.iter()`` and the first element whose
    ``tag`` equals *tag* wins.

    Args:
        tree: Element to search; it must provide ``iter()``.
        tag: Exact tag name to look for.

    Returns:
        The element's text with every ``<...>`` run removed, ``''`` when the
        element exists but has no text, or ``None`` when no element carries
        that tag.
    """
    for el in tree.iter():
        if el.tag == tag:
            # el.text is None for an empty element; passing None to re.sub
            # would raise TypeError, so treat it as an empty string.
            return re.sub(r'<.*?>', '', el.text or '')
    return None

# Save only the wanted fields of every listed case to txt files.
def save_local(list_xml, output_path):
    """Download every case named in a case-list response and save it as text.

    For each ``판례일련번호`` element in the list, the case is fetched once with
    ``myfunction`` and five fields are written, one per line, to
    ``<cwd>/<output_path>/<serial number>.txt``.

    Args:
        list_xml: Response object whose ``text`` attribute holds the case-list
            XML.
        output_path: Directory, relative to the current working directory,
            that receives the txt files. It is created when missing.
    """
    # Strip the encoding declaration: lxml refuses str input that carries one.
    data = re.sub(' encoding="UTF-8"', '', list_xml.text)
    data = et.XML(data)

    target_dir = os.path.join(os.getcwd(), output_path)
    os.makedirs(target_dir, exist_ok=True)

    # Tags written to each file, in output order.
    fields = ('판례정보일련번호', '사건명', '판시사항', '판결요지', '판례내용')

    for el in data.iter():
        if el.tag != "판례일련번호" or not el.text:
            continue
        # Fetch the case once and reuse the tree for every field instead of
        # issuing one HTTP request per field. Fetching before opening the file
        # also avoids leaving an empty file behind when the request fails.
        case_tree = myfunction(el.text)
        # NOTE(review): the file is written with the platform default encoding,
        # as before; confirm whether the downstream reader expects UTF-8.
        with open(os.path.join(target_dir, el.text + ".txt"), 'w') as f:
            for tag in fields:
                # A missing field becomes an empty line so every file keeps
                # the same five-line layout.
                f.write((findtext(case_tree, tag) or '') + '\n')
        print(el.text + ' : saved')

# Merge several txt files into a single one.
def txt_concat(path, output_name):
    """Merge every ``.txt`` file directly under *path* into one file.

    Args:
        path: Directory whose ``.txt`` files are merged (not recursive).
        output_name: Output file path without the ``.txt`` extension, which
            is appended here.

    Raises:
        OSError: If *path* cannot be listed or a file cannot be opened.
    """
    # List first so a bad *path* fails before the output file is created.
    # Sorted so the merged file is reproducible; os.listdir order is arbitrary.
    file_list = sorted(os.listdir(path))

    output_path = output_name + '.txt'
    output_key = os.path.normcase(os.path.abspath(output_path))

    with open(output_path, 'w') as output:
        for name in file_list:
            if not name.endswith('.txt'):
                continue
            source_path = os.path.join(path, name)
            # Never read the file that is currently being written.
            if os.path.normcase(os.path.abspath(source_path)) == output_key:
                continue
            with open(source_path, 'r') as f:
                for line in f:
                    output.write(line)
                    # NOTE(review): this extra newline after every copied
                    # line leaves a blank line between lines of the merged
                    # file. Kept from the previous implementation as read;
                    # confirm whether one separator per file was intended.
                    output.write('\n')

# Ties every step together; this is the function meant to be called directly.
def all_in_one(keyword, case_num):
    """Search for cases, save each one locally and merge them into one file.

    The case list is requested from the law.go.kr search API, every listed
    case is saved under ``<cwd>/Case_Main`` by ``save_local``, and the saved
    files are merged into ``total_case.txt`` by ``txt_concat``.

    Args:
        keyword: Search term, sent as the ``query`` parameter.
        case_num: Number of results to ask for, sent as the ``display``
            parameter.

    Raises:
        requests.RequestException: If the list request fails, times out, or
            the server answers with an HTTP error status.
    """
    # Query parameters of the case-list request.
    # NOTE(review): the service may cap ``display`` per request; if fewer
    # cases than case_num come back, paging is probably needed - confirm
    # against the API guide.
    chart = {
        'OC': 'ICTPoolC',
        'target': 'prec',
        'type': 'XML',
        'query': keyword,
        'display': case_num,
        'curt': '대법원',
        'prncYd': '20000101~20191231',
    }

    # Fetch the case list; the timeout keeps a stalled connection from
    # blocking the whole run.
    res_chart = rq.get(
        'http://www.law.go.kr/DRF/lawSearch.do?',
        params=chart,
        timeout=30,
    )
    res_chart.raise_for_status()

    # Make sure the output directory exists before anything is written to it.
    case_dir = os.path.join(os.getcwd(), 'Case_Main')
    os.makedirs(case_dir, exist_ok=True)

    save_local(res_chart, 'Case_Main')
    txt_concat(case_dir, 'total_case')

def test1():
    """Print the literal marker ``test1`` to standard output."""
    marker = 'test1'
    print(marker)
Loading

0 comments on commit 432dd3e

Please sign in to comment.