From 7a799b2f59b27e684f716d6f3f1be4c29747bf9f Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Tue, 18 Jun 2024 05:49:58 -0600 Subject: [PATCH] Corpus --- quarto/dialectid.qmd | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/quarto/dialectid.qmd b/quarto/dialectid.qmd index 7b16836..c8efc3e 100644 --- a/quarto/dialectid.qmd +++ b/quarto/dialectid.qmd @@ -137,20 +137,31 @@ from dialectid.utils import COUNTRIES, BASEURL def corpus_size(lang): + data = [] + index = [] + for day, d in tweet_iterator(f'stats-{lang}.json.gz'): + day = pd.to_datetime(day) + data.append(d) + index.append(day) + df2 = pd.DataFrame(data, index=index) + train = next(tweet_iterator(f'stats-{lang}-train.json')) test = next(tweet_iterator(f'stats-{lang}-test.json')) df = pd.DataFrame([train, test], index=['Train', 'Test']) df.columns.name = 'Countries' + df.loc['All'] = df2.sum(axis=0) columns = COUNTRIES[lang] + df = df.reindex(['All', 'Train', 'Test']) _ = df[columns].T.sort_values('Train', ascending=False) return Markdown(_.to_markdown()) for lang in COUNTRIES: - if isfile(f'stats-{lang}-train.json'): + if isfile(f'stats-{lang}.json.gz'): continue Download(f'{BASEURL}/stats-{lang}-train.json', f'stats-{lang}-train.json') - Download(f'{BASEURL}/stats-{lang}-test.json', f'stats-{lang}-test.json') + Download(f'{BASEURL}/stats-{lang}-test.json', f'stats-{lang}-test.json') + Download(f'{BASEURL}/stats-{lang}.json.gz', f'stats-{lang}.json.gz') ``` ## Column {.tabset}