diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml new file mode 100644 index 00000000..1d929ee4 --- /dev/null +++ b/.github/workflows/conda-build.yml @@ -0,0 +1,60 @@ +name: Conda Build + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + channels: conda-forge,defaults,bioconda + + - name: Create conda environment + run: conda env create -f environment.yml + + - name: Activate conda environment + run: | + source $CONDA/etc/profile.d/conda.sh + conda activate sdrf-pipelines + shell: bash + + - name: Install conda-build + run: | + source $CONDA/etc/profile.d/conda.sh + conda activate sdrf-pipelines + conda install -n sdrf-pipelines conda-build anaconda-client + shell: bash -l {0} + + - name: Activate conda environment and build package + run: | + source $CONDA/etc/profile.d/conda.sh + conda activate sdrf-pipelines + conda build recipe + shell: bash -l {0} + + - name: Install the built package + run: | + source $CONDA/etc/profile.d/conda.sh + conda activate sdrf-pipelines + conda install --use-local sdrf-pipelines + shell: bash -l {0} + + - name: Test the installed package + run: | + conda activate sdrf-pipelines + parse_sdrf --help + shell: bash -l {0} + + - name: Test other commands + run: | + conda activate sdrf-pipelines + parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --check_ms + shell: bash -l {0} diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..18d08193 --- /dev/null +++ b/environment.yml @@ -0,0 +1,20 @@ +name: sdrf-pipelines +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.8 + - conda-build + - anaconda-client + - pandas + - click + - requests + - pyyaml + - numpy + - defusedxml + - pyarrow + - python-duckdb + - rdflib + - pandas_schema + diff --git a/recipe/meta.yaml b/recipe/meta.yaml new file mode 100644 index 00000000..3f2827c8 --- /dev/null +++ b/recipe/meta.yaml @@ -0,0 +1,49 @@ +# recipe/meta.yaml +package: + name: sdrf-pipelines + version: "0.0.27" + +source: + path: ../ + +build: + noarch: python + entry_points: + - parse_sdrf = sdrf_pipelines.parse_sdrf:main + script: + - {{ PYTHON }} -m pip install --no-deps --ignore-installed . 
+ +requirements: + host: + - pip + - python >=3.5 + run: + - click + - requests + - pandas + - pandas_schema + - python >=3.5 + - pyyaml + - defusedxml + - pytest + - python-duckdb + - rdflib + - pyarrow + +test: + imports: + - sdrf_pipelines + - sdrf_pipelines.openms + - sdrf_pipelines.sdrf + - sdrf_pipelines.utils + - sdrf_pipelines.ols + commands: + - parse_sdrf --help + +about: + home: "https://github.com/bigbio/sdrf-pipelines" + license: Apache-2.0 + ##license_file: LICENSE ## patch is applied upstream, next version will have it + summary: "Translate and convert SDRF files into pipeline configuration files" + doc_url: "https://github.com/bigbio/sdrf-pipelines" + dev_url: "https://github.com/bigbio/sdrf-pipelines" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2bb73b84..e503651b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,7 @@ pandas_schema requests pyyaml numpy -defusedxml \ No newline at end of file +defusedxml +pyarrow +duckdb +rdflib \ No newline at end of file diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index 15331024..573b11cd 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.26" +__version__ = "0.0.27" diff --git a/sdrf_pipelines/zooma/__init__.py b/sdrf_pipelines/ols/__init__.py similarity index 100% rename from sdrf_pipelines/zooma/__init__.py rename to sdrf_pipelines/ols/__init__.py diff --git a/sdrf_pipelines/ols/bto.parquet b/sdrf_pipelines/ols/bto.parquet new file mode 100644 index 00000000..c9ab5c9a Binary files /dev/null and b/sdrf_pipelines/ols/bto.parquet differ diff --git a/sdrf_pipelines/ols/chebi.parquet b/sdrf_pipelines/ols/chebi.parquet new file mode 100644 index 00000000..b65f2798 Binary files /dev/null and b/sdrf_pipelines/ols/chebi.parquet differ diff --git a/sdrf_pipelines/ols/cl.parquet b/sdrf_pipelines/ols/cl.parquet new file mode 100644 index 00000000..8df93e8d Binary files /dev/null and b/sdrf_pipelines/ols/cl.parquet differ diff --git a/sdrf_pipelines/ols/clo.parquet b/sdrf_pipelines/ols/clo.parquet new file mode 100644 index 00000000..0db33344 Binary files /dev/null and b/sdrf_pipelines/ols/clo.parquet differ diff --git a/sdrf_pipelines/ols/efo-base.parquet b/sdrf_pipelines/ols/efo-base.parquet new file mode 100644 index 00000000..74135b5b Binary files /dev/null and b/sdrf_pipelines/ols/efo-base.parquet differ diff --git a/sdrf_pipelines/ols/mondo.parquet b/sdrf_pipelines/ols/mondo.parquet new file mode 100644 index 00000000..54f890d0 Binary files /dev/null and b/sdrf_pipelines/ols/mondo.parquet differ diff --git a/sdrf_pipelines/ols/ncbitaxon.parquet b/sdrf_pipelines/ols/ncbitaxon.parquet new file mode 100644 index 00000000..5e2f9821 Binary files /dev/null and b/sdrf_pipelines/ols/ncbitaxon.parquet differ diff --git a/sdrf_pipelines/ols/ncit.parquet b/sdrf_pipelines/ols/ncit.parquet new file mode 100644 index 00000000..eb0c25ff Binary files /dev/null and b/sdrf_pipelines/ols/ncit.parquet differ diff --git a/sdrf_pipelines/ols/ols.py b/sdrf_pipelines/ols/ols.py new file mode 100644 index 00000000..c7666f85 --- /dev/null +++ b/sdrf_pipelines/ols/ols.py @@ -0,0 +1,428 @@ +""" +OLS API wrapper + +Original code borrowed from + https://github.com/cthoyt/ols-client/blob/master/src/ols_client/client.py + +- Removed ontology and term methods.
+- Added details/parameters for all search methods + +TODO: check input parameters are valid +TODO: handle requests.exceptions.ConnectionError when traffic is too high and API goes down +""" + +import glob +import logging +import os.path +import urllib.parse + +import duckdb +import pandas as pd +import pkg_resources +import rdflib +import requests + +OLS = "https://www.ebi.ac.uk/ols4" + +__all__ = ["OlsClient"] + +logger = logging.getLogger(__name__) + +API_SUGGEST = "/api/suggest" +API_SEARCH = "/api/search" +API_SELECT = "/api/select" +API_TERM = "/api/ontologies/{ontology}/terms/{iri}" +API_ANCESTORS = "/api/ontologies/{ontology}/terms/{iri}/ancestors" +API_PROPERTIES = "/api/ontologies/{ontology}/properties?lang=en" + + +def _concat_str_or_list(input_str): + """ + Always returns a comma-joined string, whether the input is a + single string or an iterable + @:param input_str: string or iterable of strings to join + """ + + if isinstance(input_str, str): + return input_str + + return ",".join(input_str) + + +def _dparse(iri): + """ + Double URL-encode the IRI, as required by the OLS API + @:param iri: the IRI to encode + """ + return urllib.parse.quote_plus(urllib.parse.quote_plus(iri)) + + +class OlsTerm: + def __init__(self, iri: str = None, term: str = None, ontology: str = None) -> None: + self._iri = iri + self._term = term + self._ontology = ontology + + def __str__(self) -> str: + return f"{self._term} -- {self._ontology} -- {self._iri}" + + +def get_cache_parquet_files(): + """ + Returns the list of parquet cache files bundled with the package and + the list of ontologies they cover. + """ + parquet_files_pattern = pkg_resources.resource_filename(__name__, "*.parquet") + parquet_files = glob.glob(parquet_files_pattern) + + if not parquet_files: + logger.info("No parquet files found in %s", parquet_files_pattern) + return parquet_files, [] + + # select the distinct ontology names across all parquet files, + # using the duckdb library to read them.
+ df = duckdb.execute("""SELECT DISTINCT ontology FROM read_parquet(?)""", (parquet_files,)).fetchdf() + + if df is None or df.empty: + return parquet_files, [] + + ontologies = df.ontology.unique().tolist() + return parquet_files, ontologies + + +def get_obo_accession(uri): + # Example: Convert 'http://www.ebi.ac.uk/efo/EFO_0000001' to 'EFO:0000001' + try: + if "#" in uri: + fragment = uri.split("#")[-1] + else: + fragment = uri.split("/")[-1] + + prefix, identifier = fragment.split("_") + return f"{prefix}:{identifier}" + except Exception as ex: + logger.error("Error converting URI %s to OBO accession: %s", uri, ex) + + return None + + +def read_owl_file(ontology_file, ontology_name=None): + """ + Reads an OWL file and returns a list of term dictionaries + (accession, label, ontology) + @:param ontology_file: The path to the OWL file + @:param ontology_name: The name of the ontology + """ + g = rdflib.Graph() + g.parse(ontology_file, format="xml") + terms_info = [] + + for s, _, _ in g.triples((None, rdflib.RDF.type, rdflib.OWL.Class)): + term_id = str(s) + for _, _, name in g.triples((s, rdflib.RDFS.label, None)): + term_name = str(name) + terms_info.append({"accession": get_obo_accession(term_id), "label": term_name, "ontology": ontology_name}) + + # remove terms with no label or accession + terms_info = [term for term in terms_info if term.get("label") and term.get("accession")] + return terms_info + + +def read_obo_file(ontology_file, ontology_name=None): + """ + Reads an OBO file and returns a list of term dictionaries + (accession, label, ontology) + @:param ontology_file: The path to the OBO file + @:param ontology_name: The name of the ontology; read from the file header if not provided + """ + + def split_terms(content): + terms = content.split("[Term]")[1:] # Skip the header and split by [Term] + return terms + + def get_ontology_name(content): + lines = content.split("\n") + for line in lines: + if line.startswith("ontology:"): + return line.split("ontology:")[1].strip() + return None + + def parse_term(term, ontology_name): + term_info = {} + lines = term.strip().split("\n") + for line in lines: + if line.startswith("id:"): + term_info["accession"] = line.split("id:")[1].strip() + term_info["ontology"] = ontology_name + elif line.startswith("name:"): + term_info["label"] = line.split("name:")[1].strip() + return term_info + + with open(ontology_file, "r") as file: + content = file.read() + + terms = split_terms(content) + ontology_name = get_ontology_name(content) if ontology_name is None else ontology_name + terms_info = [parse_term(term, ontology_name) for term in terms] + + return terms_info + + +class OlsClient: + def __init__(self, ols_base=None, ontology=None, field_list=None, query_fields=None, use_cache=True): + """ + @:param ols_base: The base URL for the OLS + @:param ontology: The name of the ontology + @:param field_list: The list of fields to return + @:param query_fields: The list of fields to query + @:param use_cache: Whether to use the local parquet cache files bundled with the package + """ + self.base = (ols_base if ols_base else OLS).rstrip("/") + self.session = requests.Session() + + self.ontology = ontology if ontology else None + self.field_list = field_list if field_list else None + self.query_fields = query_fields if query_fields else None + + self.ontology_suggest = self.base + API_SUGGEST + self.ontology_select = self.base + API_SELECT + self.ontology_search = self.base + API_SEARCH + self.ontology_term = self.base + API_TERM + self.ontology_ancestors = self.base + API_ANCESTORS + + if use_cache: + self.use_cache = use_cache + parquet_ontologies, ontologies =
get_cache_parquet_files() + if len(parquet_ontologies) == 0: + self.use_cache = False + else: + self.parquet_files = parquet_ontologies + self.ontologies = ontologies + else: + self.use_cache = False + + @staticmethod + def build_ontology_index(ontology_file: str, output_file: str = None, ontology_name: str = None): + """ + Builds an index of an ontology file in OBO or OWL format. The output is a parquet file, written with gzip compression, containing only three columns: + - the accession of the term in the form ONTOLOGY:NUMBER (e.g. GO:0000001). + - the name (label) of the term. + - the ontology in which the term is found (e.g. GO). + @:param ontology_file: The path to the ontology file + @:param output_file: The path of the output parquet file + @:param ontology_name: The name of the ontology + """ + + if ontology_file is None or not os.path.isfile(ontology_file): + raise ValueError(f"File {ontology_file} is None or does not exist") + + # check the extension of the ontology file; anything that is not OBO is treated as OWL + owl_file = False + if not ontology_file.lower().endswith(".obo"): + owl_file = True + if ontology_name is None: + raise ValueError("Ontology name is required for OWL files") + + if output_file is None or not output_file.lower().endswith(".parquet"): + output_file = os.path.splitext(ontology_file)[0] + ".parquet" + + logger.info("Building index of %s", ontology_file) + + if owl_file: + terms = read_owl_file(ontology_file, ontology_name=ontology_name) + else: + terms = read_obo_file(ontology_file, ontology_name=ontology_name) + + # remove terms with no label + terms = [term for term in terms if "label" in term] + df = pd.DataFrame(terms) + + df.to_parquet(output_file, compression="gzip") + logger.info("Indexing has finished, output file: %s", output_file) + + def besthit(self, name, **kwargs): + """ + Select the first element of the /search API response + """ + search_resp = self.search(name, **kwargs) + if search_resp: + return search_resp[0] + + return None + + def get_term(self, ontology, iri): + """ + Gets the data for a given term + Args: + ontology: The name of the ontology + iri: The IRI of a term + """ + + url = self.ontology_term.format(ontology=ontology, iri=_dparse(iri)) + response = self.session.get(url) + return response.json() + + def get_ancestors(self, ont, iri): + """ + Gets the ancestors of a given term + @param ont: The name of the ontology + @param iri: The IRI of a term + """ + url = self.ontology_ancestors.format(ontology=ont, iri=_dparse(iri)) + response = self.session.get(url) + try: + return response.json()["_embedded"]["terms"] + except KeyError as ex: + logger.warning("Term was found but ancestor lookup returned an empty response: %s", response.json()) + raise ex + + def search(self, term: str, ontology: str, exact=True, **kwargs): + """ + Search a term in the OLS, falling back to the local cache when the API returns no hits + @:param term: The name of the term + @:param ontology: The name of the ontology + @:param exact: Forces exact match if not `None` + """ + terms = self.ols_search(term, ontology=ontology, exact=exact, **kwargs) + # ols_search returns an empty list (not None) when nothing is found, + # so fall back to the cache on any empty result + if not terms and self.use_cache: + terms = self.cache_search(term, ontology) + return terms + + def _perform_ols_search(self, params, name, exact, retry_num=0): + try: + req = self.session.get(self.ontology_search, params=params) + logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code) + + if req.status_code != 200: + logger.error("OLS search term
%s error, retry number %s", name, retry_num) + req.raise_for_status() + + response_json = req.json() + num_found = response_json["response"]["numFound"] + docs = response_json["response"]["docs"] + + if num_found == 0: + logger.debug("OLS %s search returned empty response for %s", "exact" if exact else "", name) + return [] + + return docs + except Exception as ex: + logger.exception("OLS error searching term %s. Error: %s", name, ex) + # signal the failure to the retry loop in ols_search + return None + + def ols_search( + self, + name: str, + query_fields=None, + ontology: str = None, + field_list=None, + children_of=None, + exact: bool = None, + bytype: str = "class", + rows: int = 10, + num_retries: int = 10, + start: int = 0, + ): + params = {"q": name, "type": _concat_str_or_list(bytype), "rows": rows, "start": start} + if ontology: + params["ontology"] = _concat_str_or_list(ontology.lower()) + elif self.ontology: + params["ontology"] = _concat_str_or_list(self.ontology) + + if exact: + params["exact"] = "on" + + if query_fields: + params["queryFields"] = _concat_str_or_list(query_fields) + elif self.query_fields: + params["queryFields"] = _concat_str_or_list(self.query_fields) + + if field_list: + params["fieldList"] = _concat_str_or_list(field_list) + elif self.field_list: + params["fieldList"] = _concat_str_or_list(self.field_list) + + if children_of: + params["childrenOf"] = _concat_str_or_list(children_of) + + docs_found = [] + + # the loop doubles as retry (on errors or empty pages) and pagination + for retry_num in range(num_retries): + docs = self._perform_ols_search(params, name=name, exact=exact, retry_num=retry_num) + if docs: + docs_found.extend(docs) + if len(docs) < rows: + return docs_found + # a full page was returned, request the next one + start += rows + params["start"] = start + + return docs_found + + def suggest(self, name, ontology=None): + """Suggest terms from an optional list of ontologies + + .. seealso:: https://www.ebi.ac.uk/ols/docs/api#_suggest_term + """ + params = {"q": name} + if ontology: + params["ontology"] = ",".join(ontology) + response = self.session.get(self.ontology_suggest, params=params) + response.raise_for_status() + + if response.json()["response"]["numFound"]: + return response.json()["response"]["docs"] + logger.debug("OLS suggest returned empty response for %s", name) + return None + + def select(self, name, ontology=None, field_list=None): + """Select terms, + tuned specifically to support applications such as autocomplete. + + .. seealso:: https://www.ebi.ac.uk/ols4/docs/api#_select + """ + params = {"q": name} + if ontology: + params["ontology"] = ",".join(ontology) + if field_list: + params["fieldList"] = ",".join(field_list) + response = self.session.get(self.ontology_select, params=params) + response.raise_for_status() + + if response.json()["response"]["numFound"]: + return response.json()["response"]["docs"] + logger.debug("OLS select returned empty response for %s", name) + return None + + def cache_search(self, term: str, ontology: str, full_search: bool = False) -> list: + """ + Search a term in the cache files and return the hits as a list. + @param term: The name of the term + @param ontology: The name of the ontology + @param full_search: search all cached ontologies when the requested one is not cached + """ + is_cached = False + if ontology is not None: + for cache_ontologies in self.ontologies: + if cache_ontologies.lower() == ontology.lower(): + is_cached = True + break + if not is_cached and not full_search: + return [] + + if ontology is not None: + duckdb_conn = duckdb.execute( + """SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?) AND lower(ontology) = lower(?)""", + (self.parquet_files, term, ontology), + ) + else: + duckdb_conn = duckdb.execute( + """SELECT * FROM read_parquet(?)
WHERE lower(label) = lower(?)""", (self.parquet_files, term) + ) + df = duckdb_conn.fetchdf() + + if df is None or df.empty: + return [] + + terms = [] + for _, row in df.iterrows(): + terms.append({"ontology_name": row.ontology, "label": row.label, "obo_id": row.accession}) + + return terms diff --git a/sdrf_pipelines/ols/pato.parquet b/sdrf_pipelines/ols/pato.parquet new file mode 100644 index 00000000..58eeb404 Binary files /dev/null and b/sdrf_pipelines/ols/pato.parquet differ diff --git a/sdrf_pipelines/ols/pride.parquet b/sdrf_pipelines/ols/pride.parquet new file mode 100644 index 00000000..77305f0d Binary files /dev/null and b/sdrf_pipelines/ols/pride.parquet differ diff --git a/sdrf_pipelines/ols/psi-ms.parquet b/sdrf_pipelines/ols/psi-ms.parquet new file mode 100644 index 00000000..32d07425 Binary files /dev/null and b/sdrf_pipelines/ols/psi-ms.parquet differ diff --git a/sdrf_pipelines/ols/uberon.parquet b/sdrf_pipelines/ols/uberon.parquet new file mode 100644 index 00000000..e54f3950 Binary files /dev/null and b/sdrf_pipelines/ols/uberon.parquet differ diff --git a/sdrf_pipelines/parse_sdrf.py b/sdrf_pipelines/parse_sdrf.py index 741fffdd..c6079a91 100755 --- a/sdrf_pipelines/parse_sdrf.py +++ b/sdrf_pipelines/parse_sdrf.py @@ -13,6 +13,7 @@ from sdrf_pipelines.maxquant.maxquant import Maxquant from sdrf_pipelines.msstats.msstats import Msstats from sdrf_pipelines.normalyzerde.normalyzerde import NormalyzerDE +from sdrf_pipelines.ols.ols import OlsClient from sdrf_pipelines.openms.openms import OpenMS from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame from sdrf_pipelines.sdrf.sdrf_schema import ALL_TEMPLATES @@ -226,12 +227,27 @@ def normalyzerde_from_sdrf(ctx, sdrf, conditionsfromcolumns, outpath, outpathcom ) +@click.command("build-index-ontology", short_help="Convert an ontology file to an index file") +@click.option("--ontology", "-in", help="ontology file") +@click.option("--index", "-out", help="Output file in parquet format") +@click.option("--ontology_name", "-name", help="ontology name") +@click.pass_context +def build_index_ontology(ctx, ontology: str, index: str, ontology_name: str = None): + ols_client = OlsClient() + + if ontology.lower().endswith(".owl") and ontology_name is None: + raise ValueError("Please provide the ontology name for the owl file") + + ols_client.build_ontology_index(ontology, index, ontology_name) + + cli.add_command(validate_sdrf) cli.add_command(openms_from_sdrf) cli.add_command(maxquant_from_sdrf) cli.add_command(split_sdrf) cli.add_command(msstats_from_sdrf) cli.add_command(normalyzerde_from_sdrf) +cli.add_command(build_index_ontology) def main(): diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index d37fa849..381ce846 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -13,9 +13,9 @@ from pandas_schema.validation import _BaseValidation from pandas_schema.validation import _SeriesValidation +from sdrf_pipelines.ols.ols import OlsClient from sdrf_pipelines.sdrf import sdrf from sdrf_pipelines.utils.exceptions import LogicError -from sdrf_pipelines.zooma.ols import OlsClient client = OlsClient() diff --git a/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py b/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py index 297913b4..c624e38d 100644 --- a/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py @@ -4,9 +4,9 @@ import pandas as pd import yaml +from sdrf_pipelines.ols.ols import OlsClient from 
sdrf_pipelines.openms.unimod import UnimodDatabase from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame -from sdrf_pipelines.zooma.zooma import OlsClient # Accessing ontologies and CVs unimod = UnimodDatabase() diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py deleted file mode 100644 index a19a890e..00000000 --- a/sdrf_pipelines/zooma/ols.py +++ /dev/null @@ -1,259 +0,0 @@ -""" -OLS API wrapper - -Original code borrowed from - https://github.com/cthoyt/ols-client/blob/master/src/ols_client/client.py - -- Removed ontology and term methods. -- Added details/parameters for all search methods - -TODO: check input parameters are valid -TODO: handle requests.exceptions.ConnectionError when traffic is too high and API goes down -""" - -import logging -import urllib.parse - -import requests - -OLS = "https://www.ebi.ac.uk/ols4" - -__all__ = ["OlsClient"] - -logger = logging.getLogger(__name__) - -API_SUGGEST = "/api/suggest" -API_SEARCH = "/api/search" -API_SELECT = "/api/select" -API_TERM = "/api/ontologies/{ontology}/terms/{iri}" -API_ANCESTORS = "/api/ontologies/{ontology}/terms/{iri}/ancestors" - - -def _concat_str_or_list(input_str): - """ - Always returns a comma joined list, whether the input is a - single string or an iterable - @:param input_str String to join - """ - - if type(input_str) is str: - return input_str - - return ",".join(input_str) - - -def _dparse(iri): - """ - Double url encode the IRI, which is required - @:param iri in the OLS - """ - return urllib.parse.quote_plus(urllib.parse.quote_plus(iri)) - - -class OlsClient: - def __init__(self, ols_base=None, ontology=None, field_list=None, query_fields=None): - """ - :param ols_base: An optional, custom URL for the OLS RESTful API. - """ - self.base = (ols_base if ols_base else OLS).rstrip("/") - self.session = requests.Session() - - self.ontology = ontology if ontology else None - self.field_list = field_list if field_list else None - self.query_fields = query_fields if query_fields else None - - self.ontology_suggest = self.base + API_SUGGEST - self.ontology_select = self.base + API_SELECT - self.ontology_search = self.base + API_SEARCH - self.ontology_term = self.base + API_TERM - self.ontology_ancestors = self.base + API_ANCESTORS - - def besthit(self, name, **kwargs): - """ - select a first element of the /search API response - """ - search_resp = self.search(name, **kwargs) - if search_resp: - return search_resp[0] - - return None - - def get_term(self, ontology, iri): - """ - Gets the data for a given term - Args: - ontology: The name of the ontology - iri: The IRI of a term - """ - - url = self.ontology_term.format(ontology=ontology, iri=_dparse(iri)) - response = self.session.get(url) - return response.json() - - def get_ancestors(self, ont, iri): - """ - Gets the data for a given term - @param ont: The name of the ontology - @param iri:The IRI of a term - """ - url = self.ontology_ancestors.format(ontology=ont, iri=_dparse(iri)) - response = self.session.get(url) - try: - return response.json()["_embedded"]["terms"] - except KeyError as ex: - logger.warning("Term was found but ancestor lookup returned an empty response: %s", response.json()) - raise ex - - def search( - self, - name: str, - query_fields=None, - ontology: str = None, - field_list=None, - children_of=None, - exact: bool = None, - bytype: str = "class", - rows: int = 10, - num_retries: int = 10, - start: int = 0, - ): - """ - Searches the OLS with the given term - - @:param query_fields: By default, the search is performed over term 
labels, - synonyms, descriptions, identifiers and annotation properties. This option allows - to specify the fields to query, the defaults are - `{label, synonym, description, short_form, obo_id, annotations, logical_description, iri}` - @:param exact: Forces exact match if not `None` - @:param bytype: restrict to terms one of {class,property,individual,ontology} - @:param childrenOf: Search only under a certain term. - @:param rows: number of rows to query on each call of OLS search - @:param num_retries: Number of retries to OLS when it fails. - """ - params = {"q": name} - if ontology is not None: - ontology = ontology.lower() - - if exact: - params["exact"] = "on" - - if bytype: - params["type"] = _concat_str_or_list(bytype) - - if rows: - params["rows"] = rows - - if ontology: - params["ontology"] = _concat_str_or_list(ontology) - elif self.ontology: - params["ontology"] = _concat_str_or_list(self.ontology) - - if query_fields: - params["queryFields"] = _concat_str_or_list(query_fields) - elif self.query_fields: - params["queryFields"] = _concat_str_or_list(self.query_fields) - - if field_list: - params["fieldList"] = _concat_str_or_list(field_list) - elif self.field_list: - params["fieldList"] = _concat_str_or_list(self.field_list) - - if children_of is None: - children_of = [] - if len(children_of) > 0: - params["childrenOf"] = _concat_str_or_list(children_of) - - if start: - params["start"] = start - - docs_found = [] - - for retry_num in range(num_retries): - try: - req = self.session.get(self.ontology_search, params=params) - logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code) - - if req.status_code != 200: - logger.error("OLS search term %s error tried number %s", name, retry_num) - req.raise_for_status() - else: - if req.json()["response"]["numFound"] == 0: - if exact: - logger.debug("OLS exact search returned empty response for %s", name) - else: - logger.debug("OLS search returned empty response for %s", name) - return docs_found - elif len(req.json()["response"]["docs"]) < rows: - return req.json()["response"]["docs"] - else: - docs_found = req.json()["response"]["docs"] - docs_found.extend( - self.search( - name, - query_fields=query_fields, - ontology=ontology, - field_list=field_list, - children_of=children_of, - exact=exact, - bytype=bytype, - rows=rows, - num_retries=num_retries, - start=(rows + start), - ) - ) - return docs_found - - if req.status_code == 200 and req.json()["response"]["numFound"] == 0: - if exact: - logger.debug("OLS exact search returned empty response for %s", name) - else: - logger.debug("OLS search returned empty response for %s", name) - return None - elif req.status_code != 200 and req.json()["response"]["numFound"] > 0: - if len(req.json()["response"]["docs"]) <= rows: - return req.json()["response"]["docs"] - else: - start = 0 - docs_found = req.json()["response"]["docs"] - - except Exception as ex: - logger.exception( - "OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex - ) - - return docs_found - - def suggest(self, name, ontology=None): - """Suggest terms from an optional list of ontologies - - .. 
seealso:: https://www.ebi.ac.uk/ols/docs/api#_suggest_term - """ - params = {"q": name} - if ontology: - params["ontology"] = ",".join(ontology) - response = self.session.get(self.ontology_suggest, params=params) - response.raise_for_status() - - if response.json()["response"]["numFound"]: - return response.json()["response"]["docs"] - logger.debug("OLS suggest returned empty response for %s", name) - return None - - def select(self, name, ontology=None, field_list=None): - """Select terms, - Tuned specifically to support applications such as autocomplete. - - .. see also:: https://www.ebi.ac.uk/ols4/docs/api#_select - """ - params = {"q": name} - if ontology: - params["ontology"] = ",".join(ontology) - if field_list: - params["fieldList"] = ",".join(field_list) - response = self.session.get(self.ontology_select, params=params) - response.raise_for_status() - - if response.json()["response"]["numFound"]: - return response.json()["response"]["docs"] - logger.debug("OLS select returned empty response for %s", name) - return None diff --git a/sdrf_pipelines/zooma/zooma.py b/sdrf_pipelines/zooma/zooma.py deleted file mode 100644 index d9e87515..00000000 --- a/sdrf_pipelines/zooma/zooma.py +++ /dev/null @@ -1,126 +0,0 @@ -import requests -from requests import HTTPError - -from sdrf_pipelines.zooma.ols import OlsClient - - -class OlsTerm: - def __init__(self, iri: str = None, term: str = None, ontology: str = None) -> None: - self._iri = iri - self._term = term - self._ontology = ontology - - def __str__(self) -> str: - return f"{self._term} -- {self._ontology} -- {self._iri}" - - -class SlimOlsClient: - def __init__(self) -> None: - super().__init__() - self._ols_client = OlsClient() - - @staticmethod - def get_term_from_url(url, page_size: int = 100, ontology: str = None): - """ - Return a list of terms by ontology - :param url: - :param page_size: - :param ontology: - :return: - """ - url += "&" + "size=" + str(page_size) - response = requests.get(url) - if response.status_code == 414: - raise HTTPError("URL do not exist in OLS") - json_response = response.json() - old_terms = json_response["_embedded"]["terms"] - old_terms = list(filter(lambda k: ontology in k["ontology_name"], old_terms)) - return [OlsTerm(x["iri"], x["label"], x["ontology_name"]) for x in old_terms] - - -class Zooma: - """ - A Python binding of the Zooma REST API - (http://data.bioontology.org/documentation) - """ - - BASE_URL = "https://www.ebi.ac.uk/spot/zooma/v2/api/services" - - @staticmethod - def process_zooma_results(results): - """ - Get a list of results from a query to Zooma and return a list - of dictionaries containing the queryValue, confidence and ols_url - :param results: List of query terms - :return: - """ - - ontology_terms = [] - for result in results: - ols_term = { - "queryValue": result["annotatedProperty"]["propertyValue"], - "confidence": result["confidence"], - "ols_url": result["_links"]["olslinks"][0]["href"], - } - ontology_terms.append(ols_term) - return ontology_terms - - def recommender(self, text_or_keywords, **kwargs): - """ - # https://www.ebi.ac.uk/spot/zooma/docs/api.html - - Recommender provides a set of ontology terms that match the provided text. 
- :param text_or_keywords: keyword to search - :param kwargs: filters for ontologies - :return: - """ - endpoint = "/annotate" - full_url = Zooma.BASE_URL + endpoint - payload = kwargs - payload["propertyValue"] = text_or_keywords - return self._zooma_api_request(full_url, "get", payload) - - def _zooma_api_request(self, url, method, payload=None): - if payload is None: - payload = {} - - global r, error_message - processed_payload = self._process_payload(payload) - if method == "get": - r = requests.get(url, params=processed_payload) - elif method == "post": - r = requests.post(url, data=processed_payload) - if r.status_code == 414: - raise HTTPError("Text is too long.") - - json_response = r.json() - - try: - # This will raise an HTTPError if the HTTP request returned an - # unsuccessful status code. - r.raise_for_status() - except HTTPError: - if "errors" in json_response.keys(): - error_messages = json_response["errors"] - error_message = "\n".join(error_messages) - elif "error" in json_response.keys(): - error_message = json_response["error"] - - raise HTTPError(error_message) - - return json_response - - @staticmethod - def process_value(value): - if type(value) is bool: - return str(value).lower() - return value - - def _process_payload(self, payload): - """ - Turn boolean True to str 'true' and False to str 'false'. Otherwise, - server will ignore argument with boolean value. - :param payload: - :return: - """ - return {key: self.process_value(value) for key, value in payload.items()} diff --git a/setup.py b/setup.py index c8418319..cae80cb8 100644 --- a/setup.py +++ b/setup.py @@ -33,11 +33,22 @@ def get_version(rel_path): license="'Apache 2.0", data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])], package_data={ - "": ["*.xml"], + "": ["*.xml", "*.parquet", "*.yml"], }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), - install_requires=["click", "pandas", "pandas_schema", "requests", "pytest", "pyyaml", "defusedxml"], + install_requires=[ + "click", + "pandas", + "pandas_schema", + "requests", + "pytest", + "pyyaml", + "defusedxml", + "pyarrow", + "duckdb", + "rdflib", + ], entry_points={"console_scripts": ["parse_sdrf = sdrf_pipelines.parse_sdrf:main"]}, platforms=["any"], classifiers=[ diff --git a/tests/test_ontology.py b/tests/test_ontology.py new file mode 100644 index 00000000..96d9e9a0 --- /dev/null +++ b/tests/test_ontology.py @@ -0,0 +1,15 @@ +from sdrf_pipelines.ols.ols import OlsClient + + +def test_ontology(): + ols = OlsClient() + ontology_list = ols.ols_search("homo sapiens", ontology="NCBITaxon") + print(ontology_list) + assert len(ontology_list) > 0 + + +def test_ontology_from_cache(): + ols = OlsClient() + ontology_list = ols.cache_search("homo sapiens", ontology="NCBITaxon") + print(ontology_list) + assert len(ontology_list) > 0 diff --git a/tests/test_sdrfchecker.py b/tests/test_sdrfchecker.py index 589fcbd0..ae02d373 100644 --- a/tests/test_sdrfchecker.py +++ b/tests/test_sdrfchecker.py @@ -1,8 +1,6 @@ import pytest from sdrf_pipelines.parse_sdrf import cli -from sdrf_pipelines.zooma.zooma import SlimOlsClient -from sdrf_pipelines.zooma.zooma import Zooma from .helpers import run_and_check_status_code @@ -49,22 +47,3 @@ def test_on_reference_sdrf(file_subpath, shared_datadir, on_tmpdir): test_sdrf = shared_datadir / file_subpath result = run_and_check_status_code(cli, ["validate-sdrf", "--sdrf_file", str(test_sdrf), "--check_ms"]) assert "ERROR" not in
result.output.upper(), result.output - - -def test_bioontologies(): - keyword = "human" - client = Zooma() - results = client.recommender(keyword, filters="ontologies:[nbcitaxon]") - ols_terms = client.process_zooma_results(results) - print(ols_terms) - - ols_client = SlimOlsClient() - for ols_term in ols_terms: - terms = ols_client.get_term_from_url(ols_term["ols_url"], ontology="ncbitaxon") - print(*terms, sep="\n") - - keyword = "Lung adenocarcinoma" - client = Zooma() - results = client.recommender(keyword) - ols_terms = client.process_zooma_results(results) - print(ols_terms)
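
Reviewer note: a minimal usage sketch of the new cache-backed client introduced in sdrf_pipelines/ols/ols.py, not part of the patch itself. The file names cl.obo and cl.parquet are illustrative; any local OBO or OWL ontology dump works.

# Build a local parquet index from an ontology dump; the CLI equivalent is:
#   parse_sdrf build-index-ontology --ontology cl.obo --index cl.parquet
from sdrf_pipelines.ols.ols import OlsClient

# For OBO input the ontology name is read from the file header; OWL input
# additionally requires ontology_name.
OlsClient.build_ontology_index("cl.obo", "cl.parquet")

client = OlsClient()  # use_cache=True loads the bundled *.parquet caches

# search() queries the OLS4 REST API and falls back to the parquet cache
# when the API returns nothing; cache_search() is the offline lookup alone.
terms = client.search("homo sapiens", ontology="ncbitaxon", exact=True)
offline = client.cache_search("homo sapiens", ontology="ncbitaxon")
print(terms, offline)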