From 8af8042cd9d96738b5a68a4215596887cef3d029 Mon Sep 17 00:00:00 2001 From: Kris Beicher <112945740+K-Beicher@users.noreply.github.com> Date: Tue, 4 Mar 2025 10:53:39 +0100 Subject: [PATCH] feat: update scripts to store downloaded data in a subfolder this includes a new .gitignore file, and a change to the old one --- data-raw/.gitignore | 1 + data-raw/downloaded/.gitignore | 2 ++ scripts/convert-with-core.py | 9 +++++---- scripts/download-data.py | 5 +++-- 4 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 data-raw/downloaded/.gitignore diff --git a/data-raw/.gitignore b/data-raw/.gitignore index d6b7ef3..2c2d5e0 100644 --- a/data-raw/.gitignore +++ b/data-raw/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!downloaded/ diff --git a/data-raw/downloaded/.gitignore b/data-raw/downloaded/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/data-raw/downloaded/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/scripts/convert-with-core.py b/scripts/convert-with-core.py index ba01e30..6abfe3c 100644 --- a/scripts/convert-with-core.py +++ b/scripts/convert-with-core.py @@ -5,8 +5,9 @@ resource_dir = Path(__file__).resolve().parent.parent folder_path = resource_dir / "data-raw" +downloaded_path = folder_path / "downloaded" -df_experiment1 = pl.read_csv(folder_path / "Expe1.csv", infer_schema_length=100_000) +df_experiment1 = pl.read_csv(downloaded_path / "Expe1.csv", infer_schema_length=100_000) df_experiment1 = df_experiment1.rename( { @@ -24,7 +25,7 @@ ) df_experiment1.write_csv(folder_path / "data-experiment1-ready.csv") -df_experiment2 = pl.read_csv(folder_path / "Expe2.csv", infer_schema_length=100_000) +df_experiment2 = pl.read_csv(downloaded_path / "Expe2.csv", infer_schema_length=100_000) treatment_mapping1 = { "masHmasE": "with_feces_cover_after_48h", @@ -51,7 +52,7 @@ df_experiment2.write_csv(folder_path / "data-experiment2-ready.csv") df_abundance = pl.read_csv( - folder_path / "Expe3Abundance.csv", infer_schema_length=100_000 + downloaded_path / "Expe3Abundance.csv", infer_schema_length=100_000 ) treatment_mapping2 = { @@ -66,7 +67,7 @@ ) df_richness = pl.read_csv( - folder_path / "Expe3Richness.csv", infer_schema_length=100_000 + downloaded_path / "Expe3Richness.csv", infer_schema_length=100_000 ) df_experiment3_all = pl.concat([df_abundance, df_richness], how="align") # concat wide diff --git a/scripts/download-data.py b/scripts/download-data.py index bbfe5f4..d8aa595 100644 --- a/scripts/download-data.py +++ b/scripts/download-data.py @@ -5,14 +5,15 @@ resource_dir = Path(__file__).resolve().parent.parent folder_path = resource_dir / "data-raw" +download_path = folder_path / "downloaded" # Download and save the zip file all_files = requests.get("https://zenodo.org/api/records/4965431/files-archive") -all_files_path = folder_path / "all_files.zip" +all_files_path = download_path / "all_files.zip" with open(all_files_path, "wb") as file: file.write(all_files.content) # Extract the zip file with ZipFile(all_files_path, "r") as zip_ref: - zip_ref.extractall(folder_path) + zip_ref.extractall(download_path)