Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
eliorav committed Apr 6, 2020
1 parent 7141d9a commit 8ae3b3a
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 2 deletions.
3 changes: 2 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ disable=print-statement,
arguments-differ,
unused-argument,
missing-module-docstring,
too-few-public-methods
too-few-public-methods,
multiple-statements

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
Expand Down
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,34 @@
# python-template
<p align="center">
<h3 align="center">1000G Downloader</h3>

<p align="center">
Download sample from 1000G project.
</p>
</p>

## Getting Started
### Installation

```sh
pip install one_thousand_genomes_downloader
```

### Usage example

```py
downloader = OneThousandGenomesDownloader()
downloader('HG00102')
```

## Documentation
### Class OneThousandGenomesDownloader
> OneThousandGenomesDownloader()
* file_format: the file format of the sample: bam or cram
* use_mapped_file: if to download the mapped or the unmapped file
* use_exome_alignment: if to download from the exome alignment folder
* on_download_finish: a callback function
* output_folder: output folder
## Contact
Elior Avraham – elior.av@gmail.com

1 change: 1 addition & 0 deletions one_thousand_genomes_downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from one_thousand_genomes_downloader.utilities import OneThousandGenomesDownloader
7 changes: 7 additions & 0 deletions one_thousand_genomes_downloader/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
FTP_PATH = 'ftp.1000genomes.ebi.ac.uk'
BASE_PATH = '/vol1/ftp/phase3/data'
EXOME_ALIGNMENT_DIR = 'exome_alignment'
ALIGNMENT_DIR = 'alignment'
DEFAULT_FILE_FORMAT = 'bam'
OUTPUT_FOLDER = 'samples'
FORMATS = dict(bam=('bam', 'bai'), cram=('cram', 'crai'))
76 changes: 76 additions & 0 deletions one_thousand_genomes_downloader/utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import sys
from ftplib import FTP
from tqdm import tqdm
from one_thousand_genomes_downloader.constants import BASE_PATH, ALIGNMENT_DIR, FTP_PATH, FORMATS, OUTPUT_FOLDER, \
DEFAULT_FILE_FORMAT, EXOME_ALIGNMENT_DIR


def get_sample_folder(sample, use_exome_alignment):
"""get sample folder"""
alignment_dir = EXOME_ALIGNMENT_DIR if use_exome_alignment else ALIGNMENT_DIR
return f"{BASE_PATH}/{sample}/{alignment_dir}"


def mkdir_p(name):
"""create dir if not exist"""
if not os.path.exists(name):
os.makedirs(name)


class OneThousandGenomesDownloader:
"""
FTP Downloader class
"""

def __init__(self, file_format=DEFAULT_FILE_FORMAT, use_mapped_file=True, use_exome_alignment=True,
on_download_finish=None, output_folder=None):
"""
:param file_format: the file format of the sample: bam or cram
:param use_mapped_file: if to download the mapped or the unmapped file
:param use_exome_alignment: if to download from the exome alignment folder
:param on_download_finish: a callback function
:param output_folder: output folder
"""
self.file_format = DEFAULT_FILE_FORMAT
self.set_file_format(file_format)
self.file_mapped = 'mapped' if use_mapped_file else 'unmapped'
self.on_download_finish = on_download_finish
self.output_folder = output_folder if output_folder is not None else OUTPUT_FOLDER
self.use_exome_alignment = use_exome_alignment

def __call__(self, sample):
"""
:param sample: sample name
"""
ftp = FTP(FTP_PATH)
ftp.login()
ftp.cwd(get_sample_folder(sample, self.use_exome_alignment))
files = [f for f in ftp.nlst() if
f'.{self.file_mapped}.' in f and (f.split(".")[-1] in FORMATS[self.file_format])]
out_folder = f"{self.output_folder}/{sample}"
mkdir_p(out_folder)
for idx, sample_file in enumerate(files):
with open(f'{out_folder}/{sample_file}', 'wb') as out_file:
with tqdm(total=ftp.size(sample_file),
unit_scale=True,
desc=f"{idx} - {sample_file}",
miniters=1,
file=sys.stdout,
leave=True
) as pbar:
def callback(data):
pbar.update(len(data))
out_file.write(data)

ftp.retrbinary('RETR {}'.format(sample_file), callback)
ftp.quit()
if self.on_download_finish is not None:
self.on_download_finish(sample)

def set_file_format(self, file_format):
"""Set file format"""
if file_format in FORMATS.keys():
self.file_format = file_format
else:
raise Exception(f"the file format {file_format} is not supported.")
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
tqdm
pylint

0 comments on commit 8ae3b3a

Please sign in to comment.