Commit 39368ef (1 parent: 30e5760), committed by tblock on Jan 30, 2019.
Showing 7 changed files with 20,692 additions and 0 deletions.
README.md (new file: +14 lines)
# Ten Thousand German News Articles Dataset

For more information, visit the detailed [project page](https://tblock.github.io/10kGNAD/).

1. Install the required Python packages: `pip install -r requirements.txt`.
2. Download the [One Million Posts Corpus archive](https://github.com/OFAI/million-post-corpus/releases/download/v1.0.0/million_post_corpus.tar.bz2) and decompress the `corpus.sqlite3` file into the project root.
3. Run `python code/extract_dataset_from_sqlite.py corpus.sqlite3 articles.csv` to extract the articles.
4. Run `python code/split_articles_into_train_test.py` to split the dataset; a short sketch for reading the resulting splits follows this list.
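Both scripts write CSVs with a `;` delimiter and a single-quote quote character. The following is a minimal sketch for loading the generated splits back into Python; the file names `train.csv` and `test.csv` match what the split script writes, while the `load_split` helper itself is illustrative and not part of the repository.

```python
import csv

def load_split(path):
    """Read one split back into (labels, texts) lists, using the
    same CSV dialect the scripts in this repository write."""
    labels, texts = [], []
    with open(path, "r", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=";", quotechar="'")
        for label, text in reader:
            labels.append(label)
            texts.append(text)
    return labels, texts

train_labels, train_texts = load_split("train.csv")
test_labels, test_texts = load_split("test.csv")
```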
## License

All code in this repository is licensed under an MIT License.

The dataset is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/).
[Large files in this commit are not rendered.]

code/extract_dataset_from_sqlite.py (new file: +67 lines)
#!/usr/bin/env python
# coding: utf-8

"""extract_dataset_from_sqlite.py

Exports the articles from the _One Million Posts Corpus_ and generates a
CSV file containing a label and the text for each article.
"""

import csv
import sqlite3

from tqdm import tqdm
from bs4 import BeautifulSoup
from argparse import ArgumentParser


# Select all articles below the "Newsroom" path, excluding the
# user-generated "Newsroom/User" content.
ARTICLE_QUERY = "SELECT Path, Body FROM Articles WHERE PATH LIKE 'Newsroom/%' AND PATH NOT LIKE 'Newsroom/User%' ORDER BY Path"


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(dest="sqlite_file", action="store", help="sqlite input filename", metavar="<SQLite Database>")
    parser.add_argument(dest="csv_file", action="store", help="csv output filename", metavar="<CSV Filename>")
    args = parser.parse_args()

    conn = sqlite3.connect(args.sqlite_file)
    cursor = conn.cursor()

    with open(args.csv_file, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL)

        for row in tqdm(cursor.execute(ARTICLE_QUERY).fetchall(), unit_scale=True):
            path = row[0]
            body = row[1]
            text = ""
            description = ""

            soup = BeautifulSoup(body, 'html.parser')

            # get the description from the subheadline
            description_obj = soup.find('h2', {'itemprop': 'description'})
            if description_obj is not None:
                description = description_obj.text
                description = description.replace("\n", " ").replace("\t", " ").strip() + ". "

            # get the text from the paragraphs
            text_container = soup.find('div', {'class': 'copytext'})
            if text_container is not None:
                for p in text_container.findAll('p'):
                    text += p.text.replace("\n", " ").replace("\t", " ").replace("\"", "").replace("'", "") + " "
            text = text.strip()

            # get the category from the first path segment below "Newsroom"
            category = path.split("/")[1]
            sample = [category, description + text]

            # filter empty samples, then write to csv
            if sample[1] != "":
                writer.writerow(sample)

    conn.close()
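For readers who want to see the HTML handling in isolation, here is a small sketch that exercises the same BeautifulSoup selectors on a made-up article body; the snippet and its output are illustrative, not taken from the corpus.

```python
from bs4 import BeautifulSoup

# A made-up article body that mirrors the structure the extractor expects.
html = (
    '<h2 itemprop="description">Kurze Zusammenfassung</h2>'
    '<div class="copytext"><p>Erster Absatz.</p><p>Zweiter Absatz.</p></div>'
)

soup = BeautifulSoup(html, "html.parser")

# Same selectors as the script: the subheadline carries the description ...
description = soup.find("h2", {"itemprop": "description"}).text.strip() + ". "
# ... and the article text lives in <p> tags inside the copytext container.
paragraphs = soup.find("div", {"class": "copytext"}).findAll("p")
text = " ".join(p.text for p in paragraphs)

print(description + text)
# -> Kurze Zusammenfassung. Erster Absatz. Zweiter Absatz.
```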
code/split_articles_into_train_test.py (new file: +62 lines)
#!/usr/bin/env python
# coding: utf-8

"""split_articles_into_train_test.py

Processes the dataset and splits it into a training set and a test set.
"""

import csv

from sklearn.model_selection import train_test_split

# Fraction of the dataset reserved for the test set.
SPLIT_PERCENTAGE = .1


def write_datasets(data, name):
    """Write a CSV file in the normal and, optionally, in the fastText format."""

    with open(name + ".csv", "w", encoding="utf-8", newline="") as file_write:
        writer = csv.writer(file_write, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        for row in data:
            writer.writerow(row)

    # Optionally write in the fastText format
    # with open("fsttxt_" + name + ".csv", "w") as file_write:
    #     writer = csv.writer(file_write, delimiter='\t', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
    #     for row in data:
    #         label = "__label__" + row[0]
    #         writer.writerow([label, row[1]])


if __name__ == "__main__":

    labels = []
    texts = []

    # read the full dataset file
    with open("articles.csv", "r", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='\'')
        for row in reader:
            labels.append(row[0])
            texts.append(row[1])

    # split the dataset, stratified so both splits keep the label distribution
    trn_texts, tst_texts, trn_labels, tst_labels = train_test_split(texts, labels, test_size=SPLIT_PERCENTAGE, random_state=42, stratify=labels)

    # write train and test datasets
    train = [[label, text] for label, text in zip(trn_labels, trn_texts)]
    test = [[label, text] for label, text in zip(tst_labels, tst_texts)]

    write_datasets(train, "train")
    write_datasets(test, "test")
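Because the split is stratified on the labels, each category should contribute roughly 10% of its articles to the test set. Here is a hedged sketch for verifying this after running the script; it assumes `train.csv` and `test.csv` exist in the working directory and is not part of the repository.

```python
import csv
from collections import Counter

def label_counts(path):
    """Count labels in one split, using the scripts' CSV dialect."""
    with open(path, "r", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=";", quotechar="'")
        return Counter(row[0] for row in reader)

train_counts = label_counts("train.csv")
test_counts = label_counts("test.csv")

for label in sorted(train_counts):
    ratio = test_counts[label] / (train_counts[label] + test_counts[label])
    # With SPLIT_PERCENTAGE = .1 and stratify=labels, each ratio
    # should be close to 10%.
    print(f"{label}: {ratio:.2%} of samples in the test set")
```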
requirements.txt (new file: +3 lines)
scikit-learn
beautifulsoup4
tqdm