forked from ippnsj/Lawbot
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
181 additions
and
781 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
**/Test | ||
**/Test** | ||
.ipynb_checkpoints |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: requests in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (2.24.0)\n", | ||
"Requirement already satisfied: idna<3,>=2.5 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (2.10)\n", | ||
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (1.25.10)\n", | ||
"Requirement already satisfied: certifi>=2017.4.17 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (2020.6.20)\n", | ||
"Requirement already satisfied: chardet<4,>=3.0.2 in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (from requests) (3.0.4)\n", | ||
"Requirement already satisfied: lxml in d:\\dprojects\\main\\nlpenv\\lib\\site-packages (4.5.2)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!pip install requests\n", | ||
"!pip install lxml" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "ModuleNotFoundError", | ||
"evalue": "No module named 'lxml'", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", | ||
"\u001b[1;32m<ipython-input-4-29b1022e5def>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mcases_lxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlxml_OpenAPI\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mall_in_one\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mdo\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | ||
"\u001b[1;32mD:\\DProjects\\main\\NLP\\cases_lxml\\lxml_OpenAPI.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrequests\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mrq\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mlxml\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mlxml\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0metree\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0met\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mio\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | ||
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'lxml'" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import cases_lxml.lxml_OpenAPI.all_in_one as do" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"do('손해배상', 3000)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
""" | ||
Requests 모듈을 이용하여 오픈 API로부터 판례 목록, 판례 본문등을 가져올 수 있습니다. | ||
원하는 판례 목록들을 XML파일로 받아온뒤 | ||
LXML 모듈을 이용하여 판례일련번호만을 추출하여 판례 본문을 가져오는 API에 전달하게 됩니다. | ||
여기서 본 프로젝트에서 사용하게 될 판시사항, 판결요지, 판결내용만을 추려 각각 txt파일로 저장합니다. | ||
저장된 수많은 txt파일을 sentencepiece에 학습시키기 위해 하나의 txt파일로 합쳐 total_cas.txt로 저장합니다. | ||
""" | ||
|
||
|
||
|
||
import os
import re
from io import StringIO

import lxml
import requests as rq
from lxml import etree as et
|
||
# Fetch one precedent by serial number and return it as a parsed lxml element.
def myfunction(case_number):
    """Download a single precedent from the law.go.kr open API and parse it.

    Args:
        case_number: Precedent serial number, sent as the ``ID`` query
            parameter of the ``lawService.do`` endpoint.

    Returns:
        The root element returned by ``lxml.etree.fromstring`` for the XML
        response body.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    main = {
        'OC': 'ICTPoolC',
        'target': 'prec',
        'ID': case_number,  # serial number of the precedent to fetch
        'type': 'XML',
    }
    # Without a timeout a single stalled connection blocks a bulk download
    # forever; 30 seconds is a generous upper bound for one document.
    response = rq.get(
        'http://www.law.go.kr/DRF/lawService.do?',
        params=main,
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # lxml rejects str input that still carries an encoding declaration,
    # so the declaration is removed before parsing the decoded text.
    xml_text = re.sub(' encoding="UTF-8"', '', response.text)
    return et.fromstring(xml_text)
|
||
# Return the text of the first element named `tag`, with inline markup removed.
def findtext(tree, tag):
    """Return the cleaned text of the first ``tag`` element found in ``tree``.

    Args:
        tree: An element exposing ``iter()`` (the element itself is included
            in the search, followed by its descendants in document order).
        tag: Tag name to look for.

    Returns:
        The element's text with every ``<...>`` span removed, ``''`` when the
        element exists but has no text, or ``None`` when no element carries
        that tag.
    """
    for el in tree.iter(tag):
        # An empty element has text None; re.sub would raise TypeError on it.
        return re.sub('<.*?>', '', el.text or '')
    return None
|
||
# Save the selected fields of every listed precedent to its own txt file.
def save_local(list_xml, output_path):
    """Write one ``<serial>.txt`` file per precedent found in ``list_xml``.

    For every ``판례일련번호`` element in the list response, the precedent is
    downloaded once with ``myfunction`` and five fields are written, one per
    line, in this order: 판례정보일련번호, 사건명, 판시사항, 판결요지, 판례내용.
    A field that is missing from the document is written as an empty line.

    Args:
        list_xml: Response object whose ``.text`` holds the precedent-list
            XML.
        output_path: Directory name, relative to the current working
            directory, that receives the files. It is created if missing.

    Returns:
        None. Prints ``<serial> : saved`` after each file is written.
    """
    # lxml rejects str input that still carries an encoding declaration.
    data = re.sub(' encoding="UTF-8"', '', list_xml.text)
    data = et.XML(data)

    fields = ('판례정보일련번호', '사건명', '판시사항', '판결요지', '판례내용')
    target_dir = os.path.join(os.getcwd(), output_path)
    os.makedirs(target_dir, exist_ok=True)

    for el in data.iter('판례일련번호'):
        serial = el.text
        if not serial:
            # An empty serial number cannot be requested or used as a name.
            continue
        # One request per precedent; every field is read from the same tree.
        case_tree = myfunction(serial)
        lines = [(findtext(case_tree, field) or '') + '\n' for field in fields]
        # NOTE(review): the file is written with the platform default
        # encoding, which is also what txt_concat reads with. Change both
        # together if an explicit encoding such as UTF-8 is wanted.
        with open(os.path.join(target_dir, serial + '.txt'), 'w') as f:
            f.writelines(lines)
        print(serial + ' : saved')
|
||
# Merge every txt file in a directory into one output file.
def txt_concat(path, output_name):
    """Concatenate all ``.txt`` files in ``path`` into ``output_name + '.txt'``.

    Files are appended in sorted name order so repeated runs give identical
    output. After each input file a single extra newline is written as a
    separator.

    Args:
        path: Directory containing the input files.
        output_name: Output file path without the ``.txt`` extension.

    Returns:
        None.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file,
    # so writing the separator newline once per input file (rather than once
    # per line) is an assumption - confirm against the original.
    output_file = output_name + '.txt'
    output_abs = os.path.abspath(output_file)
    names = sorted(os.listdir(path))

    with open(output_file, 'w') as output:
        for name in names:
            if not name.endswith('.txt'):
                continue
            source = os.path.join(path, name)
            # A previous output left inside `path` must not be read while it
            # is being rewritten.
            if os.path.abspath(source) == output_abs:
                continue
            with open(source, 'r') as f:
                for line in f:
                    output.write(line)
            output.write('\n')
|
||
# Entry point that chains every step; this is the function callers use.
def all_in_one(keyword, case_num):
    """Search precedents, save each one locally, then merge the saved files.

    Requests a precedent list from the ``lawSearch.do`` endpoint, stores one
    txt file per precedent under ``Case_Main`` in the current working
    directory via ``save_local``, and merges them into ``total_case.txt``
    via ``txt_concat``.

    Args:
        keyword: Search term, sent as the ``query`` parameter.
        case_num: Value sent as the ``display`` parameter (presumably the
            number of results to request - confirm against the API docs).

    Returns:
        None.

    Raises:
        requests.exceptions.HTTPError: If the list request returns a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # List request template: fixed court filter and decision-date range.
    chart = {
        'OC': 'ICTPoolC',
        'target': 'prec',
        'type': 'XML',
        'query': keyword,
        'display': case_num,
        'curt': '대법원',
        'prncYd': '20000101~20191231',
    }

    # Fetch the list; a timeout keeps a stalled server from hanging the run.
    res_chart = rq.get(
        'http://www.law.go.kr/DRF/lawSearch.do?',
        params=chart,
        timeout=30,
    )
    res_chart.raise_for_status()

    # The per-case files are opened for writing inside this directory, so it
    # has to exist before anything is saved.
    case_dir = os.path.join(os.getcwd(), 'Case_Main')
    os.makedirs(case_dir, exist_ok=True)

    save_local(res_chart, 'Case_Main')
    txt_concat(case_dir, 'total_case')
|
||
def test1():
    """Print the literal string ``test1`` to standard output."""
    message = 'test1'
    print(message)
Oops, something went wrong.