Skip to content

Commit

Permalink
refactor: update scripts to store downloaded data in a subfolder (#8)
Browse files Browse the repository at this point in the history
This includes a new .gitignore file in data-raw/downloaded/ and a change to the existing data-raw/.gitignore.

# Description

This PR creates a `downloaded` folder inside `data-raw` to store the original files
(both the downloaded zip archive and the data files extracted from it).

Closes #9

<!-- Select quick/in-depth as necessary -->
This PR needs a quick review.

## Checklist

- [x] Read through for typos, added new words to the dictionary
- [x] Checked that the README is up to date
- [x] Resolved any Ruff errors / formatted in Markdown
  • Loading branch information
K-Beicher authored Mar 5, 2025
1 parent c791f64 commit 7c817a3
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 6 deletions.
1 change: 1 addition & 0 deletions data-raw/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
*
!.gitignore
!downloaded/
2 changes: 2 additions & 0 deletions data-raw/downloaded/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
9 changes: 5 additions & 4 deletions scripts/convert-with-core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

resource_dir = Path(__file__).resolve().parent.parent
folder_path = resource_dir / "data-raw"
downloaded_path = folder_path / "downloaded"

df_experiment1 = pl.read_csv(folder_path / "Expe1.csv", infer_schema_length=100_000)
df_experiment1 = pl.read_csv(downloaded_path / "Expe1.csv", infer_schema_length=100_000)

df_experiment1 = df_experiment1.rename(
{
Expand All @@ -24,7 +25,7 @@
)
df_experiment1.write_csv(folder_path / "data-experiment1-ready.csv")

df_experiment2 = pl.read_csv(folder_path / "Expe2.csv", infer_schema_length=100_000)
df_experiment2 = pl.read_csv(downloaded_path / "Expe2.csv", infer_schema_length=100_000)

treatment_mapping1 = {
"masHmasE": "with_feces_cover_after_48h",
Expand All @@ -51,7 +52,7 @@
df_experiment2.write_csv(folder_path / "data-experiment2-ready.csv")

df_abundance = pl.read_csv(
folder_path / "Expe3Abundance.csv", infer_schema_length=100_000
downloaded_path / "Expe3Abundance.csv", infer_schema_length=100_000
)

treatment_mapping2 = {
Expand All @@ -66,7 +67,7 @@
)

df_richness = pl.read_csv(
folder_path / "Expe3Richness.csv", infer_schema_length=100_000
downloaded_path / "Expe3Richness.csv", infer_schema_length=100_000
)

df_experiment3_all = pl.concat([df_abundance, df_richness], how="align") # concat wide
Expand Down
5 changes: 3 additions & 2 deletions scripts/download-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@

resource_dir = Path(__file__).resolve().parent.parent
folder_path = resource_dir / "data-raw"
download_path = folder_path / "downloaded"

# Download and save the zip file
all_files = requests.get("https://zenodo.org/api/records/4965431/files-archive")

all_files_path = folder_path / "all_files.zip"
all_files_path = download_path / "all_files.zip"
with open(all_files_path, "wb") as file:
file.write(all_files.content)

# Extract the zip file
with ZipFile(all_files_path, "r") as zip_ref:
zip_ref.extractall(folder_path)
zip_ref.extractall(download_path)

0 comments on commit 7c817a3

Please sign in to comment.