added code and datasets
tblock committed Jan 30, 2019
1 parent 30e5760 commit 39368ef
Showing 7 changed files with 20,692 additions and 0 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
# Ten Thousand German News Articles Dataset

For more information visit the detailed [project page](https://tblock.github.io/10kGNAD/).

1. Install the required python packages `pip install -r requirements.txt`.
2. Download the corpus archive from [here](https://github.com/OFAI/million-post-corpus/releases/download/v1.0.0/million_post_corpus.tar.bz2) and decompress it so that the `corpus.sqlite3` file ends up in the project root.
3. Run `python code/extract_dataset_from_sqlite.py corpus.sqlite3 articles.csv` to extract the articles.
4. Run `python code/split_articles_into_train_test.py` to split the dataset into a training and a test set (a short loading sketch follows below).
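
Step 4 writes `train.csv` and `test.csv` to the project root. A minimal sketch for reading them back in, using only the standard library; the delimiter and quote character simply mirror the ones the scripts use:

```python
import csv

# each row is <category>;<text>, semicolon-separated and quoted with single quotes
with open("train.csv", "r") as f:
    reader = csv.reader(f, delimiter=';', quotechar="'")
    train = [(label, text) for label, text in reader]

print(len(train), "training articles, first category:", train[0][0])
```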

## License

All code in this repository is licensed under the MIT License.

The dataset is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/).
10,273 changes: 10,273 additions & 0 deletions articles.csv

Large diffs are not rendered by default.

67 changes: 67 additions & 0 deletions code/extract_dataset_from_sqlite.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python
# coding: utf-8

"""
extract_dataset_from_sqlite.py
This exports the articles from the _One Million Posts Corpus_ and generates a
CSV file containing a label and the text for each article.
"""


import sys
import csv
import sqlite3

from tqdm import tqdm
from bs4 import BeautifulSoup
from argparse import ArgumentParser


ARTICLE_QUERY = "SELECT Path, Body FROM Articles WHERE PATH LIKE 'Newsroom/%' AND PATH NOT LIKE 'Newsroom/User%' ORDER BY Path"
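# ARTICLE_QUERY selects path and HTML body of every editorial article below 'Newsroom/',
# skipping the 'Newsroom/User' subtree; the first path segment after 'Newsroom/'
# later serves as the category label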


if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument(dest="sqlite_file", action="store", help="sqlite input filename", metavar="<SQLite Database>")
parser.add_argument(dest="csv_file", action="store", help="csv output filebame", metavar="<CSV Filename>")
args = parser.parse_args()


conn = sqlite3.connect(args.sqlite_file)
cursor = conn.cursor()

with open(args.csv_file, "w") as csvfile:
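# one output row per article: <category>;<description + article text>,
# semicolon-separated and quoted with single quotes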
writer = csv.writer(csvfile, delimiter=';',quotechar='\'', quoting=csv.QUOTE_MINIMAL)

for row in tqdm(cursor.execute(ARTICLE_QUERY).fetchall(), unit_scale=True):
path = row[0]
body = row[1]
text = ""
description = ""

soup = BeautifulSoup(body, 'html.parser')

# get description from subheadline
description_obj = soup.find('h2',{'itemprop':'description'})
if description_obj is not None:
description = description_obj.text
description = description.replace("\n"," ").replace("\t"," ").strip() + ". "

# get text from paragraphs
text_container = soup.find('div',{'class':'copytext'})
if text_container is not None:
for p in text_container.findAll('p'):
text += p.text.replace("\n"," ").replace("\t"," ").replace("\"","").replace("'","") + " "
text = text.strip()

# get category from path
category = path.split("/")[1]
content = description + text

# skip articles without any text, then write the row to the csv
if content.strip() != "":
    writer.writerow([category.encode('utf-8'), content.encode('utf-8')])

conn.close()

62 changes: 62 additions & 0 deletions code/split_articles_into_train_test.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python
# coding: utf-8

import csv
import collections

from sklearn.model_selection import train_test_split

""" split_articles_into_train_test.py:
processes the dataset and splits it into a training set and a test set
"""

SPLIT_PERCENTAGE = .1
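# fraction of articles held out for the test set; the remaining 90% form the training set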


def write_datasets(data, name):
""" write a csv file in a normal and optinally in the fastText format """

with open(name + ".csv", "w") as file_write:
writer = csv.writer(file_write, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
for row in data:
writer.writerow(row)

# Optionally write in the fastText format
# with open("fsttxt_" + name + ".csv", "w") as file_write:
# writer = csv.writer(file_write, delimiter='\t', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
# for row in data:
# label = row[0]
# label = "__label__" + label
# writer.writerow([label, row[1]])


if __name__ == "__main__":

labels = []
texts = []

# read full dataset file
with open("articles.csv", "r") as csvfile:
reader = csv.reader(csvfile, delimiter=';', quotechar='\'')
for row in reader:
labels.append(row[0])
texts.append(row[1])

# split dataset
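# stratify=labels keeps the per-category distribution identical in the train and test split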
trn_texts, tst_texts, trn_labels, tst_labels = train_test_split(texts, labels, test_size=SPLIT_PERCENTAGE, random_state=42, stratify=labels)

# write train and test datasets
train = []
test = []

for i in range(len(trn_labels)):
train.append([trn_labels[i], trn_texts[i]])

for i in range(len(tst_labels)):
test.append([tst_labels[i], tst_texts[i]])

write_datasets(train, "train")
write_datasets(test, "test")


3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
scikit-learn
beautifulsoup4
tqdm
