diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 052e2ec..093667e 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -1,11 +1,15 @@ name: Lint -on: [pull_request] +on: [push, pull_request] jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - - uses: psf/black@stable + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: "3.12" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 6598590..c5d4504 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -6,22 +6,21 @@ on: jobs: deploy: - name: upload release to PyPI runs-on: ubuntu-latest + name: upload release to PyPI permissions: + contents: read id-token: write + steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - run: | - python setup.py sdist bdist_wheel + - name: Install build dependencies + run: python -m pip install --upgrade pip build + - name: Build package + run: python -m build - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 2fa4ad3..d628f1b 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -5,28 +5,26 @@ on: branches: [dev] pull_request: branches: [master, dev] + workflow_dispatch: jobs: pytest: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.9", "3.13"] - os: [ubuntu-latest] + python-version: ["3.10", "3.14"] + os: [ubuntu-latest, macos-latest] steps: - 
- uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - - - name: Install package - run: python -m pip install . + - name: Install package with test dependencies + run: python -m pip install ".[test]" - name: Run pytest tests - run: pytest tests -x -vv + run: pytest tests diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index a9b1f4a..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -include requirements/* -include README.md -include docs/img/geofetch_logo.svg -include geofetch/templates/* -include geofetch/templates/config_template.yaml -include geofetch/templates/config_processed_template.yaml -include geofetch/templates/looper_sra_convert.yaml -include geofetch/templates/looper_config_template.yaml -include geofetch/templates/pipeline_interface_convert.yaml diff --git a/geofetch/__init__.py b/geofetch/__init__.py index e1b75f9..f01f520 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,14 +1,21 @@ -"""Package-level data""" +"""Package-level data.""" + +from importlib.metadata import PackageNotFoundError, version import coloredlogs import logmuse -from geofetch._version import __version__ from geofetch.finder import Finder from geofetch.geofetch import Geofetcher -__author__ = ["Oleksandr Khoroshevskyi", "Vince Reuter", "Nathan Sheffield"] -__all__ = ["Finder", "Geofetcher", "__version__"] +__author__: list[str] = ["Oleksandr Khoroshevskyi", "Vince Reuter", "Nathan Sheffield"] + +try: + __version__: str = version("geofetch") +except PackageNotFoundError: + __version__ = "unknown" + +__all__: list[str] = ["Finder", "Geofetcher", "__version__"] _LOGGER = logmuse.init_logger("geofetch") coloredlogs.install( diff 
--git a/geofetch/_version.py b/geofetch/_version.py deleted file mode 100644 index b665bfc..0000000 --- a/geofetch/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.12.10" diff --git a/geofetch/cli.py b/geofetch/cli.py index 929904a..8586912 100644 --- a/geofetch/cli.py +++ b/geofetch/cli.py @@ -1,21 +1,26 @@ import argparse import os +from importlib.metadata import PackageNotFoundError, version import logmuse from ubiquerg import VersionInHelpParser -from geofetch._version import __version__ - -def _safe_echo(var): - """Returns an environment variable if it exists, or an empty string if not""" +def _safe_echo(var: str) -> str: + """Return an environment variable if it exists, or an empty string if not.""" return os.getenv(var, "") -def _parse_cmdl(cmdl): - """ - parser - """ +def _get_version() -> str: + """Return the package version, or 'unknown' if not installed.""" + try: + return version("geofetch") + except PackageNotFoundError: + return "unknown" + + +def _parse_cmdl(cmdl: list[str]) -> argparse.Namespace: + """Parse command-line arguments for geofetch.""" parser = VersionInHelpParser( description="Automatic GEO and SRA data downloader", usage="""geofetch [] @@ -27,7 +32,7 @@ def _parse_cmdl(cmdl): geofetch -i GSE67303 --processed --geo-folder -m """, - version=__version__, + version=_get_version(), ) processed_group = parser.add_argument_group("processed") @@ -54,7 +59,7 @@ def _parse_cmdl(cmdl): default=_safe_echo("SRAMETA"), help="Specify a parent folder location to store metadata. " "The project name will be added as a subfolder " - "[Default: $SRAMETA:" + _safe_echo("SRAMETA") + "]", + "(Default: $SRAMETA:" + _safe_echo("SRAMETA") + ")", ) parser.add_argument( @@ -87,7 +92,7 @@ def _parse_cmdl(cmdl): default=None, help="Optional: Specify one or more filepaths to SAMPLES pipeline interface yaml files. " "These will be added to the project config file to make it immediately " - "compatible with looper. [Default: null]", + "compatible with looper. 
(Default: null)", ) # Optional @@ -96,7 +101,7 @@ def _parse_cmdl(cmdl): default=None, help="Optional: Specify one or more filepaths to PROJECT pipeline interface yaml files. " "These will be added to the project config file to make it immediately " - "compatible with looper. [Default: null]", + "compatible with looper. (Default: null)", ) # Optional parser.add_argument( @@ -111,7 +116,7 @@ "--skip", default=0, type=int, - help="Skip some accessions. [Default: no skip].", + help="Skip some accessions. (Default: no skip).", ) parser.add_argument( @@ -132,7 +137,7 @@ type=int, default=50, help="Optional: Limit of the number of the constant sample characters " - "that should not be in project yaml. [Default: 50]", + "that should not be in project yaml. (Default: 50)", ) parser.add_argument( @@ -140,7 +145,7 @@ type=int, default=1000, help="Optional: Limit of the number of the constant sample characters " - "that should not be discarded [Default: 250]", + "that should not be discarded (Default: 1000)", ) parser.add_argument( @@ -149,7 +154,7 @@ default=500, help="Optional: Limit of the number of sample characters." "Any attribute with more than X characters will truncate to the first X," - " where X is a number of characters [Default: 500]", + " where X is a number of characters (Default: 500)", ) parser.add_argument( @@ -163,7 +168,7 @@ type=str, default="1GB", help="""Optional: Max size of soft file. - [Default: 1GB]. + (Default: 1GB). Supported input formats : 12B, 12KB, 12MB, 12GB. 
""", ) @@ -178,7 +183,7 @@ def _parse_cmdl(cmdl): "--processed", default=False, action="store_true", - help="Download processed data [Default: download raw data].", + help="Download processed data (Default: download raw data).", ) processed_group.add_argument( @@ -190,13 +195,13 @@ def _parse_cmdl(cmdl): " to retrieve processed data, which may be attached to the" " collective series entity, or to individual samples. " "Allowable values are: samples, series or both (all). " - "Ignored unless 'processed' flag is set. [Default: samples]", + "Ignored unless 'processed' flag is set. (Default: samples)", ) processed_group.add_argument( "--filter", default=None, - help="Optional: Filter regex for processed filenames [Default: None]." + help="Optional: Filter regex for processed filenames (Default: None)." "Ignored unless 'processed' flag is set.", ) @@ -205,7 +210,7 @@ def _parse_cmdl(cmdl): dest="filter_size", default=None, help="""Optional: Filter size for processed files - that are stored as sample repository [Default: None]. + that are stored as sample repository (Default: None). Works only for sample data. Supported input formats : 12B, 12KB, 12MB, 12GB. Ignored unless 'processed' flag is set.""", @@ -217,7 +222,7 @@ def _parse_cmdl(cmdl): default=_safe_echo("GEODATA"), help="Optional: Specify a location to store processed GEO files." " Ignored unless 'processed' flag is set." - "[Default: $GEODATA:" + _safe_echo("GEODATA") + "]", + "(Default: $GEODATA:" + _safe_echo("GEODATA") + ")", ) raw_group.add_argument( @@ -238,7 +243,9 @@ def _parse_cmdl(cmdl): default=_safe_echo("SRABAM"), help="""Optional: Specify folder of bam files. Geofetch will not download sra files when corresponding bam files already exist. - [Default: $SRABAM:""" + _safe_echo("SRABAM") + "]", + (Default: $SRABAM:""" + + _safe_echo("SRABAM") + + ")", ) raw_group.add_argument( @@ -248,7 +255,9 @@ def _parse_cmdl(cmdl): default=_safe_echo("SRAFQ"), help="""Optional: Specify folder of fastq files. 
Geofetch will not download sra files when corresponding fastq files already exist. - [Default: $SRAFQ:""" + _safe_echo("SRAFQ") + "]", + (Default: $SRAFQ:""" + + _safe_echo("SRAFQ") + + ")", ) # Deprecated; these are for bam conversion which now happens in sra_convert @@ -260,7 +269,7 @@ def _parse_cmdl(cmdl): default=_safe_echo("SRARAW"), help=argparse.SUPPRESS, # help="Optional: Specify a location to store sra files " - # "[Default: $SRARAW:" + safe_echo("SRARAW") + "]" + # "(Default: $SRARAW:" + safe_echo("SRARAW") + ")" ) raw_group.add_argument( "--bam-conversion", @@ -274,7 +283,7 @@ def _parse_cmdl(cmdl): dest="picard_path", default=_safe_echo("PICARD"), # help="Specify a path to the picard jar, if you want to convert " - # "fastq to bam [Default: $PICARD:" + safe_echo("PICARD") + "]", + # "fastq to bam (Default: $PICARD:" + safe_echo("PICARD") + ")", help=argparse.SUPPRESS, ) diff --git a/geofetch/const.py b/geofetch/const.py index e018920..71fc96a 100644 --- a/geofetch/const.py +++ b/geofetch/const.py @@ -1,10 +1,12 @@ +"""Package constants.""" + import re _LOGGER = None # A set of hard-coded keys if you want to limit to just a few instead of taking # all information provided in GEO. 
Use with `--use-key-subset` -ANNOTATION_SHEET_KEYS = [ +ANNOTATION_SHEET_KEYS: list[str] = [ "sample_name", "protocol", "read_type", @@ -24,49 +26,49 @@ ] # Regex to parse out SRA accession identifiers -PROJECT_PATTERN = re.compile(r"(SRP\d{4,8})") -EXPERIMENT_PATTERN = re.compile(r"(SRX\d{4,8})") -GSE_PATTERN = re.compile(r"(GSE\d{4,8})") -SUPP_FILE_PATTERN = re.compile("Sample_supplementary_file") -SER_SUPP_FILE_PATTERN = re.compile("Series_supplementary_file") +PROJECT_PATTERN: re.Pattern[str] = re.compile(r"(SRP\d{4,8})") +EXPERIMENT_PATTERN: re.Pattern[str] = re.compile(r"(SRX\d{4,8})") +GSE_PATTERN: re.Pattern[str] = re.compile(r"(GSE\d{4,8})") +SUPP_FILE_PATTERN: re.Pattern[str] = re.compile("Sample_supplementary_file") +SER_SUPP_FILE_PATTERN: re.Pattern[str] = re.compile("Series_supplementary_file") -SAMPLE_SUPP_METADATA_FILE = "_samples.csv" -EXP_SUPP_METADATA_FILE = "_series.csv" -FILE_RAW_NAME_SAMPLE_PATTERN = "_raw.csv" -FILE_RAW_NAME_SUBSAMPLE_PATTERN = "_raw_subtable.csv" +SAMPLE_SUPP_METADATA_FILE: str = "_samples.csv" +EXP_SUPP_METADATA_FILE: str = "_series.csv" +FILE_RAW_NAME_SAMPLE_PATTERN: str = "_raw.csv" +FILE_RAW_NAME_SUBSAMPLE_PATTERN: str = "_raw_subtable.csv" # How many times should we retry failing prefetch call? 
-NUM_RETRIES = 3 -REQUEST_SLEEP = 0.4 +NUM_RETRIES: int = 3 +REQUEST_SLEEP: float = 0.4 -NCBI_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term={SRP_NUMBER}&retmax=999&rettype=uilist&retmode=json" -NCBI_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id={ID}&rettype=runinfo&retmode=xml" +NCBI_ESEARCH: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term={SRP_NUMBER}&retmax=999&rettype=uilist&retmode=json" +NCBI_EFETCH: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id={ID}&rettype=runinfo&retmode=xml" -NEW_GENOME_COL_NAME = "ref_genome" +NEW_GENOME_COL_NAME: str = "ref_genome" -TEMPLATES_DIR = "templates" -CONFIG_PROCESSED_TEMPLATE_NAME = "config_processed_template.yaml" -CONFIG_RAW_TEMPLATE_NAME = "config_template.yaml" -CONFIG_SRA_TEMPLATE_NAME = "looper_sra_convert.yaml" -PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME = "pipeline_interface_convert.yaml" -LOOPER_SRA_CONVERT = "looper_config_template.yaml" -# SRA_CONVERT_SCHEMA_NAME = "sra_convert_schema.yaml" -# RESOURCES_NAME = "resources.tsv" +TEMPLATES_DIR: str = "templates" +CONFIG_PROCESSED_TEMPLATE_NAME: str = "config_processed_template.yaml" +CONFIG_RAW_TEMPLATE_NAME: str = "config_template.yaml" +CONFIG_SRA_TEMPLATE_NAME: str = "looper_sra_convert.yaml" +PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME: str = "pipeline_interface_convert.yaml" +LOOPER_SRA_CONVERT: str = "looper_config_template.yaml" # const for Finder: -RETMAX = 10000000 # once it should be increased +RETMAX: int = 10000000 # gds = geo DataSets -ETOOLS_GEO_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds" -ETOOLS_GEO_GSE_BASE = f"{ETOOLS_GEO_BASE}&term=GSE[ETYP]" +ETOOLS_GEO_BASE: str = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds" +) +ETOOLS_GEO_GSE_BASE: str = f"{ETOOLS_GEO_BASE}&term=GSE[ETYP]" -ETOOLS_ENDING = "&retmax={retmax}&usehistory=y" +ETOOLS_ENDING: str = 
"&retmax={retmax}&usehistory=y" -TODAY_DATE = "3000" +TODAY_DATE: str = "3000" -DATE_FILTER = ( +DATE_FILTER: str = ( '+AND+("{start_date}"[Publication%20Date]%20:%20"{end_date}"[Publication%20Date])' ) -THREE_MONTH_FILTER = '+AND+"published+last+3+months"[Filter]' +THREE_MONTH_FILTER: str = '+AND+"published+last+3+months"[Filter]' -LOOPER_CONFIG_FILE_NAME = "looper_config.yaml" +LOOPER_CONFIG_FILE_NAME: str = "looper_config.yaml" diff --git a/geofetch/finder.py b/geofetch/finder.py index 587e1ae..b9f7b69 100644 --- a/geofetch/finder.py +++ b/geofetch/finder.py @@ -27,72 +27,91 @@ class Finder: - """ - Class for finding GSE accessions in special period of time. - Additionally, user can add specific filters for the search, - while initialization of the class - """ + """Class for finding GSE accessions in special period of time.""" - def __init__(self, filters: str = None, retmax: int = RETMAX): + def __init__(self, filters: str | None = None, retmax: int = RETMAX) -> None: """ - :param filters: filters that have to be added to the query. - Filter Patterns can be found here: - https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag - :param retmax: maximum number of retrieved accessions. + Initialize Finder with optional filters and result limit. + + Args: + filters: Filters that have to be added to the query. + Filter Patterns can be found here: + https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag + retmax: Maximum number of retrieved accessions. """ self.query_customized_ending = ETOOLS_ENDING.format(retmax=retmax) self.query_filter_str = self._create_filter_str(filters) self.last_result = [] - def get_gse_all(self) -> list: + def get_gse_all(self) -> list[str]: """ - Get list of all gse accession available in GEO - :return: list of gse accession + Get list of all gse accessions available in GEO. + + Returns: + List of gse accessions. 
""" return self.get_gse_id_by_query(url=self._compose_url()) - def get_gse_last_3_month(self) -> list: + def get_gse_last_3_month(self) -> list[str]: """ - Get list of gse accession that were uploaded or updated in last 3 month - :return: list of gse accession + Get list of gse accessions that were uploaded or updated in last 3 months. + + Returns: + List of gse accessions. """ return self.get_gse_id_by_query(url=self._compose_url(THREE_MONTH_FILTER)) - def get_gse_last_week(self) -> list: + def get_gse_last_week(self) -> list[str]: """ - Get list of gse accession that were uploaded or updated in last week - :return: list of gse accession + Get list of gse accessions that were uploaded or updated in last week. + + Returns: + List of gse accessions. """ return self.get_gse_by_day_count(7) - def get_gse_by_day_count(self, n_days: int = 1) -> list: + def get_gse_by_day_count(self, n_days: int = 1) -> list[str]: """ - Get list of gse accessions that were uploaded or updated in last X days - :param n_days: number of days from now [e.g. 5] - :return: list of gse accession + Get list of gse accessions that were uploaded or updated in last X days. + + Args: + n_days: Number of days from now [e.g. 5]. + + Returns: + List of gse accessions. """ today = datetime.today() start_date = today - timedelta(days=n_days) start_date_str = start_date.strftime("%Y/%m/%d") return self.get_gse_by_date(start_date_str) - def get_gse_by_date(self, start_date: str, end_date: str = None) -> list: + def get_gse_by_date( + self, start_date: str, end_date: str | None = None + ) -> list[str]: """ Search gse accessions by providing start date and end date. By default, the last date is today. 
- :param start_date: the oldest date of update (from YYYY/MM/DD to now) [input format: 'YYYY/MM/DD'] - :param end_date: the nearest date of update (from __ to YYYY/MM/DD) [input format: 'YYYY/MM/DD'] - :return: list of gse accessions + + Args: + start_date: The oldest date of update (from YYYY/MM/DD to now) [input format: 'YYYY/MM/DD']. + end_date: The nearest date of update (from __ to YYYY/MM/DD) [input format: 'YYYY/MM/DD']. + + Returns: + List of gse accessions. """ if end_date is None: end_date = TODAY_DATE new_date_filter = DATE_FILTER.format(start_date=start_date, end_date=end_date) return self.get_gse_id_by_query(url=self._compose_url(new_date_filter)) - def get_gse_id_by_query(self, url: str) -> list: + def get_gse_id_by_query(self, url: str) -> list[str]: """ - Run esearch (ncbi search tool) by specifying URL and retrieve gse list result - :param url: url of the query - :return: list of gse ids + Run esearch (ncbi search tool) by specifying URL and retrieve gse list result. + + Args: + url: URL of the query. + + Returns: + List of gse ids. """ uids_list = self._run_search_query(url) gse_id_list = [self.uid_to_gse(d) for d in uids_list] @@ -102,29 +121,41 @@ def get_gse_id_by_query(self, url: str) -> list: @staticmethod def uid_to_gse(uid: str) -> str: """ - UID to GES accession converter - :param uid: uid string (Unique Identifier Number in GEO) - :return: GSE id string + UID to GSE accession converter. + + Args: + uid: UID string (Unique Identifier Number in GEO). + + Returns: + GSE id string. 
""" uid_regex = re.compile(r"[1-9]+0+([1-9]+[0-9]*)") return "GSE" + uid_regex.match(uid).group(1) @staticmethod - def find_differences(old_list: list, new_list: list) -> list: + def find_differences(old_list: list[str], new_list: list[str]) -> list[str]: """ - Compare 2 lists and search for elements that are not in old list - :param old_list: old list of elements - :param new_list: new list of elements - :return: list of elements that are not in old list but are in new_list + Compare 2 lists and search for elements that are not in old list. + + Args: + old_list: Old list of elements. + new_list: New list of elements. + + Returns: + List of elements that are not in old list but are in new_list. """ return list(set(new_list) - set(old_list)) @staticmethod - def _run_search_query(url: str) -> list: + def _run_search_query(url: str) -> list[str]: """ - Run get request and return list of uids found - :param url: url of the query - :return: list of UIDs + Run get request and return list of uids found. + + Args: + url: URL of the query. + + Returns: + List of UIDs. """ x = requests.get(url) if x.status_code != 200: @@ -142,33 +173,42 @@ def _run_search_query(url: str) -> list: return [] @staticmethod - def _create_filter_str(filters: str = None) -> str: + def _create_filter_str(filters: str | None = None) -> str: """ - Tune filter for url request - :param filters: filter should look like here: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag - :return: tuned filter string + Tune filter for url request. + + Args: + filters: Filter should look like here: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag + + Returns: + Tuned filter string. 
""" if filters == "" or filters is None: return "" return f"+(AND+{filters})" - def _compose_url(self, date_filter: str = None) -> str: + def _compose_url(self, date_filter: str | None = None) -> str: """ - Compose final url by adding date filter - :param date_filter: date filter that has to be used in the query - :return: string of final url + Compose final url by adding date filter. + + Args: + date_filter: Date filter that has to be used in the query. + + Returns: + String of final url. """ if date_filter is None: date_filter = "" return f"{ETOOLS_GEO_GSE_BASE}{self.query_filter_str}{date_filter}{self.query_customized_ending}" - def generate_file(self, file_path: str, gse_list: list = None): + def generate_file(self, file_path: str, gse_list: list[str] | None = None) -> None: """ - Save the list of GSE accessions stored in this Finder object to a given file - :param file_path: root to the file where gse accessions have to be saved - :param gse_list: list of gse accessions - :return: NoReturn + Save the list of GSE accessions stored in this Finder object to a given file. + + Args: + file_path: Root to the file where gse accessions have to be saved. + gse_list: List of gse accessions. 
""" if gse_list is None: gse_list = self.last_result diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 234d375..b61bd76 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -5,7 +5,6 @@ import re import sys import time -from typing import Dict, List, NoReturn, Tuple, Union import logmuse import pandas as pd @@ -26,18 +25,18 @@ FILE_RAW_NAME_SAMPLE_PATTERN, FILE_RAW_NAME_SUBSAMPLE_PATTERN, GSE_PATTERN, + LOOPER_CONFIG_FILE_NAME, + LOOPER_SRA_CONVERT, NCBI_EFETCH, NCBI_ESEARCH, NEW_GENOME_COL_NAME, NUM_RETRIES, + PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME, PROJECT_PATTERN, SAMPLE_SUPP_METADATA_FILE, SER_SUPP_FILE_PATTERN, SUPP_FILE_PATTERN, TEMPLATES_DIR, - PIPELINE_INTERFACE_CONVERT_TEMPLATE_NAME, - LOOPER_SRA_CONVERT, - LOOPER_CONFIG_FILE_NAME, ) from geofetch.utils import ( Accession, @@ -70,9 +69,7 @@ class Geofetcher: - """ - Class to download or get projects, metadata, data from GEO and SRA - """ + """Class to download or get projects, metadata, data from GEO and SRA.""" def __init__( self, @@ -81,16 +78,16 @@ def __init__( metadata_folder: str = "", just_metadata: bool = False, refresh_metadata: bool = False, - config_template: str = None, - pipeline_samples: str = None, - pipeline_project: str = None, + config_template: str | None = None, + pipeline_samples: str | None = None, + pipeline_project: str | None = None, skip: int = 0, acc_anno: bool = False, use_key_subset: bool = False, processed: bool = False, data_source: str = "samples", - filter: str = None, - filter_size: str = None, + filter: str | None = None, + filter_size: str | None = None, geo_folder: str = ".", split_experiments: bool = False, bam_folder: str = "", @@ -98,7 +95,7 @@ def __init__( sra_folder: str = "", bam_conversion: bool = False, picard_path: str = "", - input: str = None, + input: str | None = None, const_limit_project: int = 50, const_limit_discard: int = 1000, attr_limit_truncate: int = 500, @@ -107,76 +104,69 @@ def __init__( add_dotfile: bool = False, 
disable_progressbar: bool = False, add_convert_modifier: bool = False, - opts=None, - max_prefetch_size=None, - **kwargs, - ): - """ - Constructor - - :param input: GSEnumber or path to the input file - :param name: Specify a project name. Defaults to GSE number or name of accessions file name - :param metadata_root: Specify a parent folder location to store metadata. - The project name will be added as a subfolder [Default: $SRAMETA:] - :param metadata_folder: Specify an absolute folder location to store metadata. No subfolder will be added. - Overrides value of --metadata-root [Default: Not used (--metadata-root is used by default)] - :param just_metadata: If set, don't actually run downloads, just create metadata - :param refresh_metadata: If set, re-download metadata even if it exists. - :param config_template: Project config yaml file template. - :param pipeline_samples: Specify one or more filepaths to SAMPLES pipeline interface yaml files. + opts: object | None = None, + max_prefetch_size: str | int | None = None, + **kwargs: object, + ) -> None: + """ + Constructor. + + Args: + input: GSE number or path to the input file. + name: Specify a project name. Defaults to GSE number or name of accessions file name. + metadata_root: Specify a parent folder location to store metadata. + The project name will be added as a subfolder (Default: $SRAMETA:). + metadata_folder: Specify an absolute folder location to store metadata. No subfolder will be added. + Overrides value of --metadata-root (Default: Not used (--metadata-root is used by default)). + just_metadata: If set, don't actually run downloads, just create metadata. + refresh_metadata: If set, re-download metadata even if it exists. + config_template: Project config yaml file template. + pipeline_samples: Specify one or more filepaths to SAMPLES pipeline interface yaml files. These will be added to the project config file to make it immediately compatible with looper. 
- [Default: null] - :param pipeline_project: Specify one or more filepaths to PROJECT pipeline interface yaml files. + (Default: null). + pipeline_project: Specify one or more filepaths to PROJECT pipeline interface yaml files. These will be added to the project config file to make it immediately compatible with looper. - [Default: null] - :param acc_anno: Produce annotation sheets for each accession. + (Default: null). + acc_anno: Produce annotation sheets for each accession. Project combined PEP for the whole project won't be produced. - :param discard_soft: Create project without downloading soft files on the disc - :param add_dotfile: Add .pep.yaml file that points .yaml PEP file - :param disable_progressbar: Set true to disable progressbar - - :param const_limit_project: Optional: Limit of the number of the constant sample characters - that should not be in project yaml. [Default: 50] - :param const_limit_discard: Optional: Limit of the number of the constant sample characters - that should not be discarded [Default: 250] - :param attr_limit_truncate: Optional: Limit of the number of sample characters. + discard_soft: Create project without downloading soft files on the disc. + add_dotfile: Add .pep.yaml file that points .yaml PEP file. + disable_progressbar: Set true to disable progressbar. + const_limit_project: Optional: Limit of the number of the constant sample characters + that should not be in project yaml. (Default: 50). + const_limit_discard: Optional: Limit of the number of the constant sample characters + that should not be discarded (Default: 1000). + attr_limit_truncate: Optional: Limit of the number of sample characters. Any attribute with more than X characters will truncate to the first X, where X is a number of characters - [Default: 500] - - :param max_soft_size: Optional: Max size of soft file. - Supported input formats : 12B, 12KB, 12MB, 12GB. 
[Default value: 1GB] - - :param processed: Download processed da_soft_sizeta [Default: download raw data]. - :param data_source: Specifies the source of data on the GEO record to retrieve processed data, + (Default: 500). + max_soft_size: Optional: Max size of soft file. + Supported input formats: 12B, 12KB, 12MB, 12GB. [Default value: 1GB]. + processed: Download processed data (Default: download raw data). + data_source: Specifies the source of data on the GEO record to retrieve processed data, which may be attached to the collective series entity, or to individual samples. Allowable values are: - samples, series or both (all). Ignored unless 'processed' flag is set. [Default: samples] - :param filter: Filter regex for processed filenames [Default: None].Ignored unless 'processed' flag is set. - :param filter_size: Filter size for processed files that are stored as sample repository [Default: None]. - Works only for sample data. Supported input formats : 12B, 12KB, 12MB, 12GB. + samples, series or both (all). Ignored unless 'processed' flag is set. (Default: samples). + filter: Filter regex for processed filenames (Default: None). Ignored unless 'processed' flag is set. + filter_size: Filter size for processed files that are stored as sample repository (Default: None). + Works only for sample data. Supported input formats: 12B, 12KB, 12MB, 12GB. Ignored unless 'processed' flag is set. - :param geo_folder: Specify a location to store processed GEO files. - Ignored unless 'processed' flag is set.[Default: $GEODATA:] - - :param split_experiments: Split SRR runs into individual samples. By default, SRX experiments with multiple SRR + geo_folder: Specify a location to store processed GEO files. + Ignored unless 'processed' flag is set. (Default: $GEODATA:). + split_experiments: Split SRR runs into individual samples. 
By default, SRX experiments with multiple SRR Runs will have a single entry in the annotation table, with each run as a separate row in the - subannotation table. This setting instead treats each run as a separate sample [Works with raw data] - :param bam_folder: Optional: Specify folder of bam files. Geofetch will not download sra files when - corresponding bam files already exist. [Default: $SRABAM:] [Works with raw data] - :param fq_folder: Optional: Specify folder of fastq files. Geofetch will not download sra files when corresponding - fastq files already exist. [Default: $SRAFQ:] [Works with raw data] - :param use_key_subset: Use just the keys defined in this module when writing out metadata. [Works with raw data] - :param sra_folder: Optional: Specify a location to store sra files - [Default: $SRARAW:" + safe_echo("SRARAW") + ] - :param bam_conversion: Optional: set True to convert bam files [Works with raw data] - :param picard_path: Specify a path to the picard jar, if you want to convert fastq to bam - [Default: $PICARD:" + safe_echo("PICARD") + "] [Works with raw data] - :param add_convert_modifier: Add looper SRA convert modifier to config file. - - :param skip: Skip some accessions. [Default: no skip]. - :param opts: opts object [Optional] - :param str | int max_prefetch_size: argmuent to prefetch command's --max-size option; - for reference: https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump#check-the-maximum-size-limit-of-the-prefetch-tool - :param kwargs: other values + subannotation table. This setting instead treats each run as a separate sample [Works with raw data]. + bam_folder: Optional: Specify folder of bam files. Geofetch will not download sra files when + corresponding bam files already exist. (Default: $SRABAM:) [Works with raw data]. + fq_folder: Optional: Specify folder of fastq files. Geofetch will not download sra files when corresponding + fastq files already exist. (Default: $SRAFQ:) [Works with raw data]. 
+ use_key_subset: Use just the keys defined in this module when writing out metadata. [Works with raw data]. + sra_folder: Optional: Specify a location to store sra files. + bam_conversion: Optional: set True to convert bam files [Works with raw data]. + picard_path: Specify a path to the picard jar, if you want to convert fastq to bam [Works with raw data]. + add_convert_modifier: Add looper SRA convert modifier to config file. + skip: Skip some accessions. (Default: no skip). + opts: opts object [Optional]. + max_prefetch_size: Argument to prefetch command's --max-size option. + kwargs: Other values. """ global _LOGGER @@ -291,11 +281,15 @@ def get_projects( self, input: str, just_metadata: bool = True, discard_soft: bool = True ) -> dict: """ - Function for fetching projects from GEO|SRA and receiving peppy project - :param input: GSE number, or path to file of GSE numbers - :param just_metadata: process only metadata - :param discard_soft: clean run, without downloading soft files - :return: peppy project or list of project, if acc_anno is set. + Fetch projects from GEO|SRA and return peppy projects. + + Args: + input: GSE number, or path to file of GSE numbers. + just_metadata: Process only metadata. + discard_soft: Clean run, without downloading soft files. + + Returns: + Peppy project or list of projects, if acc_anno is set. """ self.just_metadata = just_metadata self.just_object = True @@ -357,13 +351,18 @@ def get_projects( return new_pr_dict - def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Project]: + def fetch_all(self, input: str, name: str | None = None) -> None | peppy.Project: """ - Main function driver/workflow - Function that search, filters, downloads and save data and metadata from GEO and SRA - :param input: GSE or input file with gse's - :param name: Name of the project - :return: NoReturn or peppy Project + Main function driver/workflow. + + Searches, filters, downloads and saves data and metadata from GEO and SRA. 
+ + Args: + input: GSE or input file with GSE accessions. + name: Name of the project. + + Returns: + None or peppy Project. """ if name is not None: @@ -581,16 +580,20 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje def _process_sra_meta( self, - srp_list_result: list = None, - gsm_enter_dict: dict = None, - gsm_metadata: dict = None, - ): + srp_list_result: list | None = None, + gsm_enter_dict: dict | None = None, + gsm_metadata: dict | None = None, + ) -> tuple[dict, dict, list]: """ - Create srp multitable and update gsm_metadata based on srp - :param srp_list_result: list of srp got from sra file - :param gsm_enter_dict: gsm enter content - :param gsm_metadata: dict of samples of gsm - :return: srp multitable + Create SRP multitable and update gsm_metadata based on SRP. + + Args: + srp_list_result: List of SRP records from SRA file. + gsm_enter_dict: GSM enter content. + gsm_metadata: Dict of samples of GSM. + + Returns: + Tuple of (gsm_multi_table, gsm_metadata, runs). """ gsm_multi_table = {} runs = [] @@ -665,11 +668,12 @@ def _process_sra_meta( return gsm_multi_table, gsm_metadata, runs - def _download_raw_data(self, run_name: str) -> NoReturn: + def _download_raw_data(self, run_name: str) -> None: """ - Download raw data from SRA by providing run name + Download raw data from SRA by providing run name. - :param run_name: Run name from SRA + Args: + run_name: Run name from SRA. 
""" bam_file = ( "" @@ -716,16 +720,20 @@ def _download_raw_data(self, run_name: str) -> NoReturn: def fetch_processed_one( self, - gse_file_content: list, - gsm_file_content: list, + gse_file_content: list[str], + gsm_file_content: list[str], gsm_filter_list: dict, - ) -> Tuple: + ) -> tuple[list, list]: """ - Fetche one processed GSE project and return its metadata - :param gsm_file_content: gse soft file content - :param gse_file_content: gsm soft file content - :param gsm_filter_list: list of gsm that have to be downloaded - :return: Tuple of project list of gsm samples and gse samples + Fetch one processed GSE project and return its metadata. + + Args: + gse_file_content: GSE soft file content. + gsm_file_content: GSM soft file content. + gsm_filter_list: List of GSM that have to be downloaded. + + Returns: + Tuple of (meta_processed_samples, meta_processed_series). """ ( meta_processed_samples, @@ -752,17 +760,19 @@ def _generate_processed_meta( name: str, meta_processed_samples: list, meta_processed_series: list, - gse_meta_dict: Union[dict, None] = None, + gse_meta_dict: dict | None = None, ) -> dict: """ - Generate and save PEPs for processed accessions. GEO has data in GSE and GSM, - conditions are used to decide which PEPs will be saved. - :param name: name of the folder/file where PEP will be saved - :param meta_processed_samples: - :param meta_processed_series: - :param gse_meta_dict: dict of metadata fetched from one experiment. - Used to add this data to config file. - :return: dict of objects if just_object is set, otherwise dicts of None + Generate and save PEPs for processed accessions. + + Args: + name: Name of the folder/file where PEP will be saved. + meta_processed_samples: List of processed sample metadata. + meta_processed_series: List of processed series metadata. + gse_meta_dict: Dict of metadata fetched from one experiment. + + Returns: + Dict of objects if just_object is set, otherwise dicts of None. 
""" return_objects = {f"{name}_samples": None, f"{name}_series": None} @@ -822,13 +832,14 @@ def _generate_processed_meta( def _download_processed_data( self, acc_gse: str, meta_processed_samples: list, meta_processed_series: list - ) -> NoReturn: + ) -> None: """ - Download processed data from GEO by providing project annotation list - :param acc_gse: accession number of the project - :param meta_processed_samples: list of annotation of samples - :param meta_processed_series: list of annotation of series - :return: Noreturn + Download processed data from GEO by providing project annotation list. + + Args: + acc_gse: Accession number of the project. + meta_processed_samples: List of annotation of samples. + meta_processed_series: List of annotation of series. """ data_geo_folder = os.path.join(self.geo_folder, acc_gse) _LOGGER.debug("Data folder: " + data_geo_folder) @@ -862,10 +873,13 @@ def _download_processed_data( def _expand_metadata_dict(self, metadata_dict: dict) -> dict: """ - Expand all lists of all items in the dict by creating new items or joining them + Expand all lists of all items in the dict by creating new items or joining them. + + Args: + metadata_dict: Metadata dict. - :param metadata_dict: metadata dict - :return: expanded metadata dict + Returns: + Expanded metadata dict. """ prj_list = _dict_to_list_converter(proj_dict=metadata_dict) prj_list = self._expand_metadata_list(prj_list) @@ -873,10 +887,13 @@ def _expand_metadata_dict(self, metadata_dict: dict) -> dict: def _expand_metadata_list(self, metadata_list: list) -> list: """ - Expanding all lists of all items in the list by creating new items or joining them + Expand all lists of all items in the list by creating new items or joining them. - :param list metadata_list: list of dicts that store metadata - :return list: expanded metadata list + Args: + metadata_list: List of dicts that store metadata. + + Returns: + Expanded metadata list. 
""" _LOGGER.info("Expanding metadata list...") list_of_keys = _get_list_of_keys(metadata_list) @@ -886,14 +903,16 @@ def _expand_metadata_list(self, metadata_list: list) -> list: metadata_list = self._expand_metadata_list_item(metadata_list, key_in_list) return metadata_list - def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): + def _expand_metadata_list_item(self, metadata_list: list, dict_key: str) -> list: """ - Expand list of one element (item) in the list by creating new items or joining them - ["first1: fff", ...] -> separate columns + Expand list of one element (item) in the list by creating new items or joining them. + + Args: + metadata_list: List of dicts that store metadata. + dict_key: Key in the dictionaries that have to be expanded. - :param list metadata_list: list of dicts that store metadata - :param str dict_key: key in the dictionaries that have to be expanded - :return list: expanded metadata list + Returns: + Expanded metadata list. """ try: element_is_list = any( @@ -995,10 +1014,12 @@ def _write_gsm_annotation(self, gsm_metadata: dict, file_annotation: str) -> str """ Write metadata sheet out as an annotation file. - :param Mapping gsm_metadata: the data to write, parsed from a file - with metadata/annotation information - :param str file_annotation: the path to the file to write - :return str: path to the file + Args: + gsm_metadata: The data to write, parsed from a file with metadata/annotation information. + file_annotation: The path to the file to write. + + Returns: + Path to the file written. 
""" keys = list(list(gsm_metadata.values())[0].keys()) fp = expandpath(file_annotation) @@ -1018,17 +1039,19 @@ def _write_processed_annotation( processed_metadata: list, file_annotation_path: str, just_object: bool = False, - gse_meta_dict: dict = None, - ) -> Union[NoReturn, peppy.Project]: + gse_meta_dict: dict | None = None, + ) -> None | peppy.Project: """ - Save annotation file by providing list of dictionaries with files metadata + Save annotation file by providing list of dictionaries with files metadata. + + Args: + processed_metadata: List of dictionaries with files metadata. + file_annotation_path: The path to the metadata file that has to be saved. + just_object: True if you want to get peppy object without saving file. + gse_meta_dict: Dict of metadata fetched from one experiment. - :param list processed_metadata: list of dictionaries with files metadata - :param str file_annotation_path: the path to the metadata file that has to be saved - :param just_object: True, if you want to get peppy object without saving file - :param gse_meta_dict: dict of metadata fetched from one experiment. - Used to add this data to config file. - :return: none, or peppy project + Returns: + None, or peppy Project. """ if len(processed_metadata) == 0: _LOGGER.info( @@ -1095,10 +1118,13 @@ def _write_processed_annotation( @staticmethod def _find_genome(metadata_list: list) -> list: """ - Create new genome column by searching joining few columns + Create new genome column by searching and joining few columns. - :param metadata_list: list with metadata dict - :return: list with metadata dict where genome column was added + Args: + metadata_list: List with metadata dicts. + + Returns: + List with metadata dicts where genome column was added. 
""" list_keys = _get_list_of_keys(metadata_list) genome_keys = [ @@ -1116,20 +1142,22 @@ def _find_genome(metadata_list: list) -> list: def _write_raw_annotation_new( self, - name, + name: str, metadata_dict: dict, - subannot_dict: dict = None, - gse_meta_dict: dict = None, - ) -> Union[None, peppy.Project]: + subannot_dict: dict | None = None, + gse_meta_dict: dict | None = None, + ) -> None | peppy.Project: """ - Combine individual accessions into project-level annotations, and writing - individual accession files (if requested) + Combine individual accessions into project-level annotations. + + Args: + name: Name of the run, project, or accession; influences the folder name. + metadata_dict: Dictionary of sample annotations. + subannot_dict: Dictionary of subsample annotations. + gse_meta_dict: Dict of experiment metadata stored in GSE. - :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created - :param metadata_dict: dictionary of sample annotations - :param subannot_dict: dictionary of subsample annotations - :param gse_meta_dict: dict of experiment metadata that was sotred in gse - :return: none or peppy object + Returns: + None or peppy Project. """ try: assert len(metadata_dict) > 0 @@ -1250,15 +1278,18 @@ def _create_config_processed( self, file_annotation_path: str, proj_meta: list, - meta_in_series: dict = True, + meta_in_series: dict | None = None, ) -> str: """ - Compose and generate config file content + Compose and generate config file content for processed data. - :param file_annotation_path: root to the annotation file - :param proj_meta: common metadata that has to added to config file - :param meta_in_series: - :return: generated, complete config file content + Args: + file_annotation_path: Path to the annotation file. + proj_meta: Common metadata to add to config file. + meta_in_series: Series metadata dict. + + Returns: + Generated config file content. 
""" geofetchdir = os.path.dirname(__file__) @@ -1295,16 +1326,23 @@ def _create_config_processed( return template def _create_config_raw( - self, proj_meta, proj_root_sample, subanot_path_yaml, meta_in_series=None - ): + self, + proj_meta: list, + proj_root_sample: str, + subanot_path_yaml: str, + meta_in_series: dict | None = None, + ) -> str: """ - Compose and generate config file content for raw data + Compose and generate config file content for raw data. - :param proj_meta: root to the annotation file - :param proj_root_sample: path to sampletable file - :param subanot_path_yaml: path to subannotation file - :param meta_in_series: - :return: generated, complete config file content + Args: + proj_meta: Common metadata to add to config file. + proj_root_sample: Path to sampletable file. + subanot_path_yaml: Path to subannotation file. + meta_in_series: Series metadata dict. + + Returns: + Generated config file content. """ meta_list_str = [ f'{list(i.keys())[0]}: "{_sanitize_config_string(list(i.values())[0])}"' @@ -1360,11 +1398,13 @@ def _create_config_raw( @staticmethod def _check_sample_name_standard(metadata_dict: dict) -> dict: """ - Standardize sample name and checking if it exists - (This function is used for raw data) + Standardize sample names and check if they exist. + + Args: + metadata_dict: Metadata dict. - :param metadata_dict: metadata dict - :return: metadata dict with standardize sample names + Returns: + Metadata dict with standardized sample names. """ fixed_dict = {} for key_sample, value_sample in metadata_dict.items(): @@ -1381,21 +1421,22 @@ def _check_sample_name_standard(metadata_dict: dict) -> dict: @staticmethod def _separate_common_meta( - meta_list: Union[List, Dict], + meta_list: list | dict, max_len: int = 50, del_limit: int = 1000, attr_limit_truncate: int = 500, ) -> tuple: """ - Separate experiment(project) metadata from sample metadata + Separate experiment(project) metadata from sample metadata. 
+ + Args: + meta_list: List of dictionaries of samples. + max_len: Threshold of the length of the common value that can be stored in the sample table. + del_limit: Threshold of the length of the common value that have to be deleted. + attr_limit_truncate: Max length of the attribute in the sample csv. - :param list or dict meta_list: list of dictionaries of samples - :param int max_len: threshold of the length of the common value that can be stored in the sample table - :param int del_limit: threshold of the length of the common value that have to be deleted - :param int attr_limit_truncate: max length of the attribute in the sample csv - :return set: Return is a set of list, where 1 list (or dict) is - list of samples metadata dictionaries and 2: list of common samples metadata - dictionaries that are linked to the project. + Returns: + Tuple of (sample metadata list/dict, common project metadata list). """ # check if meta_list is dict and converting it to list @@ -1461,12 +1502,12 @@ def _separate_common_meta( meta_list = _dict_to_list_converter(proj_list=meta_list) return meta_list, new_meta_project - def _download_SRA_file(self, run_name: str): + def _download_SRA_file(self, run_name: str) -> None: """ - Download SRA file by ising 'prefetch' utility from the SRA Toolkit - more info: (http://www.ncbi.nlm.nih.gov/books/NBK242621/) + Download SRA file using 'prefetch' from the SRA Toolkit. - :param str run_name: SRR number of the SRA file + Args: + run_name: SRR number of the SRA file. 
""" # Set up a simple loop to try a few times in case of failure @@ -1488,12 +1529,13 @@ def _download_SRA_file(self, run_name: str): _LOGGER.info("Prefetch attempt failed, wait a few seconds to try again") time.sleep(t * 2) - def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn: + def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> None: """ - Convert SRA file to BAM file by using samtools function "sam-dump" + Convert SRA file to BAM file using sam-dump. - :param str bam_file: path to BAM file that has to be created - :param str run_name: SRR number of the SRA file that has to be converted + Args: + bam_file: Path to BAM file that has to be created. + run_name: SRR number of the SRA file that has to be converted. """ _LOGGER.info("Converting to bam: " + run_name) sra_file = os.path.join(self.sra_folder, run_name + ".sra") @@ -1514,14 +1556,17 @@ def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoRet run_subprocess(cmd, shell=True) def _sra_to_bam_conversion_fastq_damp( - self, bam_file: str, run_name: str, picard_path: str = None - ) -> NoReturn: + self, bam_file: str, run_name: str, picard_path: str | None = None + ) -> None: """ - Convert SRA file to BAM file by using fastq-dump - (is used when sam-dump fails, yielding an empty bam file. Here fastq -> bam conversion is used) - :param str bam_file: path to BAM file that has to be created - :param str run_name: SRR number of the SRA file that has to be converted - :param str picard_path: Path to The Picard toolkit. More info: https://broadinstitute.github.io/picard/ + Convert SRA file to BAM file using fastq-dump. + + Used when sam-dump fails, yielding an empty bam file. + + Args: + bam_file: Path to BAM file that has to be created. + run_name: SRR number of the SRA file that has to be converted. + picard_path: Path to the Picard toolkit. 
""" # check to make sure it worked @@ -1555,18 +1600,19 @@ def _sra_to_bam_conversion_fastq_damp( run_subprocess(cmd, shell=True) def _write_subannotation( - self, tabular_data: dict, filepath: str, column_names: list = None - ): + self, tabular_data: dict, filepath: str, column_names: list[str] | None = None + ) -> str: """ Write one or more tables to a given CSV filepath. - :param tabular_data: Mapping | Iterable[Mapping]: single KV pair collection, or collection - of such collections, to write to disk as tabular data - :param str filepath: path to file to write, possibly with environment - variables included, e.g. from a config file - :param Iterable[str] column_names: collection of names for columns to - write - :return str: path to file written + Args: + tabular_data: Single KV pair collection, or collection of such collections, + to write to disk as tabular data. + filepath: Path to file to write, possibly with environment variables. + column_names: Collection of names for columns to write. + + Returns: + Path to file written. """ _LOGGER.info(f"Sample subannotation sheet: {filepath}") fp = expandpath(filepath) @@ -1584,14 +1630,20 @@ def _write_subannotation( return fp def _download_file( - self, file_url: str, data_folder: str, new_name: str = None, sleep_after=0.5 - ) -> NoReturn: + self, + file_url: str, + data_folder: str, + new_name: str | None = None, + sleep_after: float = 0.5, + ) -> None: """ - Given an url for a file, downloading file to specified folder - :param str file_url: the URL of the file to download - :param str data_folder: path to the folder where data should be downloaded - :param float sleep_after: time to sleep after downloading - :param str new_name: new file name in the + Download a file from a URL to a specified folder. + + Args: + file_url: The URL of the file to download. + data_folder: Path to the folder where data should be downloaded. + new_name: New file name to use. + sleep_after: Time to sleep after downloading. 
""" filename = os.path.basename(file_url) if new_name is None: @@ -1614,13 +1666,17 @@ def _download_file( _LOGGER.info(f"\033[38;5;242mFile {full_filepath} exists.\033[0m") def _get_list_of_processed_files( - self, file_gse_content: list, file_gsm_content: list - ) -> tuple: + self, file_gse_content: list[str], file_gsm_content: list[str] + ) -> tuple[list, list]: """ - Given a paths to GSE and GSM metafile create a list of dicts of metadata of processed files - :param list file_gse_content: list of lines of gse metafile - :param list file_gsm_content: list of lines of gse metafile - :return: tuple[list of metadata of processed sample files and series files] + Create a list of dicts of metadata of processed files from GSE and GSM content. + + Args: + file_gse_content: List of lines of GSE metafile. + file_gsm_content: List of lines of GSM metafile. + + Returns: + Tuple of (sample metadata list, series metadata list). """ tar_re = re.compile(r".*\.tar$") gse_numb = None @@ -1789,10 +1845,14 @@ def _get_list_of_processed_files( def _run_filter(self, meta_list: list, col_name: str = "file") -> list: """ - Filters files and metadata using Regular expression filter - :param meta_list: list of composed metadata - :param col_name: name of the column where file names are stored - :return: metadata list after file_name filter + Filter files and metadata using a regular expression filter. + + Args: + meta_list: List of composed metadata. + col_name: Name of the column where file names are stored. + + Returns: + Metadata list after filename filter. 
""" filtered_list = [] for meta_elem in meta_list: @@ -1805,12 +1865,16 @@ def _run_filter(self, meta_list: list, col_name: str = "file") -> list: return filtered_list - def _run_size_filter(self, meta_list, col_name="file_size"): + def _run_size_filter(self, meta_list: list, col_name: str = "file_size") -> list: """ - Filters files and metadata by file size column specified in meta_list - :param meta_list: list of composed metadata - :param col_name: name of the column where is size information stored - :return: metadata list after size filter + Filter files and metadata by file size. + + Args: + meta_list: List of composed metadata. + col_name: Name of the column where size information is stored. + + Returns: + Metadata list after size filter. """ if self.filter_size is not None: filtered_list = [] @@ -1830,11 +1894,14 @@ def _run_size_filter(self, meta_list, col_name="file_size"): def _download_processed_file(self, file_url: str, data_folder: str) -> bool: """ - Given a url for a file, download it, and extract anything passing the filter. - :param str file_url: the URL of the file to download - :param str data_folder: the local folder where the file should be saved - :return bool: True if the file is downloaded successfully; false if it does - not pass filters and is not downloaded. + Download a file from a URL, extracting anything passing the filter. + + Args: + file_url: The URL of the file to download. + data_folder: The local folder where the file should be saved. + + Returns: + True if the file is downloaded successfully. 
""" if not self.geo_folder: @@ -1871,13 +1938,19 @@ def _download_processed_file(self, file_url: str, data_folder: str) -> bool: if ntry > 4: raise e - def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): + def _get_SRA_meta( + self, + file_gse_content: list[str], + gsm_metadata: dict, + file_sra: str | None = None, + ) -> list: """ - Parse out the SRA project identifier from the GSE file + Parse out the SRA project identifier from the GSE file. - :param list file_gse_content: list of content of file_sde_content - :param dict gsm_metadata: dict of GSM metadata - :param str file_sra: full path to SRA.csv metafile that has to be downloaded + Args: + file_gse_content: List of content of GSE file. + gsm_metadata: Dict of GSM metadata. + file_sra: Full path to SRA.csv metafile that has to be downloaded. """ # acc_SRP = None @@ -1960,11 +2033,15 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): ) return [] - def _get_SRP_list(self, srp_number: str) -> list: + def _get_SRP_list(self, srp_number: str) -> list[dict]: """ - Get a list of srp by using requests and xml searching and getting list of dicts of SRRs - :param str srp_number: SRP number - :return: list of dicts of SRRs + Get a list of SRR dicts by querying NCBI for an SRP number. + + Args: + srp_number: SRP number. + + Returns: + List of dicts of SRR records. """ if not srp_number: _LOGGER.info("No srp number in this accession found") @@ -2004,15 +2081,18 @@ def _get_SRP_list(self, srp_number: str) -> list: return SRP_list def _read_gsm_metadata( - self, acc_GSE: str, acc_GSE_list: dict, file_gsm_content: list + self, acc_GSE: str, acc_GSE_list: dict, file_gsm_content: list[str] ) -> dict: """ - A simple state machine to parse SOFT formatted files (Here, the GSM file) + Parse SOFT formatted GSM file using a simple state machine. 
- :param str acc_GSE: GSE number (Series accession) - :param dict acc_GSE_list: list of GSE - :param list file_gsm_content: list of contents of gsm file - :return dict: dictionary of experiment information (gsm_metadata) + Args: + acc_GSE: GSE number (Series accession). + acc_GSE_list: Dict of GSE accessions. + file_gsm_content: List of contents of GSM file. + + Returns: + Dictionary of experiment information (gsm_metadata). """ gsm_metadata = {} @@ -2088,9 +2168,9 @@ def _read_gsm_metadata( _LOGGER.debug(f"(SRX accession: {found[0]})") srx_id = found[0] gsm_metadata[srx_id] = gsm_metadata.pop(current_sample_id) - gsm_metadata[srx_id][ - "gsm_id" - ] = current_sample_id # save the GSM id + gsm_metadata[srx_id]["gsm_id"] = ( + current_sample_id # save the GSM id + ) current_sample_id = srx_id current_sample_srx = True # GSM SOFT file parsed, save it in a list @@ -2102,15 +2182,17 @@ def _write( self, f_var_value: str, content: str, - msg_pre: str = None, + msg_pre: str | None = None, omit_newline: bool = False, - ): + ) -> None: """ - Save new file (used for config file) - :param f_var_value: path to the file - :param content: content of the file - :param msg_pre: msg that have to be printed - :param omit_newline: omit new line + Save a new file (used for config files). + + Args: + f_var_value: Path to the file. + content: Content of the file. + msg_pre: Message prefix to print. + omit_newline: Whether to omit trailing newline. 
""" fp = expandpath(f_var_value) _LOGGER.info((msg_pre or "") + fp) @@ -2120,8 +2202,8 @@ def _write( f.write("\n") -def main(): - """Run the script.""" +def main() -> None: + """Run the geofetch CLI.""" args = _parse_cmdl(sys.argv[1:]) args_dict = vars(args) args_dict["args"] = args diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index 7b05a34..282e8b8 100755 --- a/geofetch/sraconvert.py +++ b/geofetch/sraconvert.py @@ -1,21 +1,22 @@ #!/usr/bin/env python +import argparse import os import sys -from ubiquerg import VersionInHelpParser +from importlib.metadata import PackageNotFoundError, version import logmuse import pypiper - -__version__ = "0.1.0" +from ubiquerg import VersionInHelpParser -def _parse_cmdl(cmdl): - description = """ The SRA data converter is a wrapper around sra-tools that - provides convenience functions for converting or deleting sra data in - various formats. - """ - parser = VersionInHelpParser(description=description) +def _parse_cmdl(cmdl: list[str]) -> argparse.Namespace: + """Parse command-line arguments for sraconvert.""" + parser = VersionInHelpParser( + description="The SRA data converter is a wrapper around sra-tools that " + "provides convenience functions for converting or deleting sra data in " + "various formats." 
+ ) # parser = pypiper.add_pypiper_args(parser, args=["output-parent"]) parser.add_argument( "-m", @@ -38,7 +39,7 @@ def _parse_cmdl(cmdl): "--bamfolder", default=safe_echo("SRABAM"), help="Optional: Specify a location to store bam files " - "[Default: $SRABAM:" + safe_echo("SRABAM") + "]", + "(Default: $SRABAM:" + safe_echo("SRABAM") + ")", ) parser.add_argument( @@ -46,7 +47,7 @@ def _parse_cmdl(cmdl): "--fqfolder", default=safe_echo("SRAFQ"), help="Optional: Specify a location to store fastq files " - "[Default: $SRAFQ:" + safe_echo("SRAFQ") + "]", + "(Default: $SRAFQ:" + safe_echo("SRAFQ") + ")", ) parser.add_argument( @@ -54,7 +55,7 @@ def _parse_cmdl(cmdl): "--srafolder", default=safe_echo("SRARAW"), help="Optional: Specify a location to store pipeline output " - "[Default: $SRARAW:" + safe_echo("SRARAW") + "]", + "(Default: $SRARAW:" + safe_echo("SRARAW") + ")", ) parser.add_argument( @@ -72,8 +73,12 @@ def _parse_cmdl(cmdl): help="Name for sample to run", metavar="SAMPLE_NAME", ) + try: + _pkg_version = version("geofetch") + except PackageNotFoundError: + _pkg_version = "unknown" parser.add_argument( - "-V", "--version", action="version", version=f"%(prog)s {__version__}" + "-V", "--version", action="version", version=f"%(prog)s {_pkg_version}" ) parser.add_argument("-r", "--srr", required=True, nargs="+", help="SRR files") @@ -85,25 +90,19 @@ def _parse_cmdl(cmdl): return parser.parse_args(cmdl) -def safe_echo(var): - """Returns an environment variable if it exists, or an empty string if not""" +def safe_echo(var: str) -> str: + """Return an environment variable if it exists, or an empty string if not.""" return os.getenv(var, "") -def uniqify(seq): # Dave Kirby - """ - Return only unique items in a sequence, preserving order - - :param list seq: List of items to uniqify - :return list[object]: Original list with duplicates removed - """ - # Order preserving +def uniqify(seq: list) -> list: + """Return only unique items in a sequence, preserving order.""" 
seen = set() return [x for x in seq if x not in seen and not seen.add(x)] -def main(): - """Run the script.""" +def main() -> None: + """Run the sraconvert pipeline.""" cmdl = sys.argv[1:] args = _parse_cmdl(cmdl) global _LOGGER diff --git a/geofetch/utils.py b/geofetch/utils.py index 66bdd88..5055121 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -7,7 +7,6 @@ import subprocess import sys from io import StringIO -from typing import Dict, List, NoReturn, Union import requests @@ -24,23 +23,28 @@ def build_prefetch_command( - run_id: str, prefetch_path: str = "prefetch", max_size: Union[str, int] = None -) -> List[str]: + run_id: str, prefetch_path: str = "prefetch", max_size: str | int | None = None +) -> list[str]: cmd = [prefetch_path, run_id] if max_size is not None: cmd.extend(["--max-size", str(max_size)]) return cmd -def is_known_type(accn: str = None, typename: str = None): +def is_known_type(accn: str | None = None, typename: str | None = None) -> bool: """ Determine if the given accession is of a known type. - :param str accn: accession of interest - :param str typename: check this typename for known status rather - than parsing an accession - :return bool: whether the given accession is of a known type. - :raise TypeError: if neither argument is provided or one/both are empty. + Args: + accn: Accession of interest. + typename: Check this typename for known status rather + than parsing an accession. + + Returns: + Whether the given accession is of a known type. + + Raises: + TypeError: If neither argument is provided or one/both are empty. 
""" if not (accn or typename): raise TypeError("Specify either accession or accession typename") @@ -53,7 +57,12 @@ def is_known_type(accn: str = None, typename: str = None): return False -def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=None): +def parse_accessions( + input_arg: str, + metadata_folder: str, + just_metadata: bool = False, + max_size: str | int | None = None, +) -> dict[str, dict] | None: """ Create a list of GSE accessions, either from file or a single value. @@ -62,11 +71,12 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N interested in from that GSE#. An empty sample list means we should get all samples from that GSE#. This loop will create this dict. - :param input_arg: Input argument (GSE, or file) - :param str metadata_folder: path to folder for accession metadata - :param bool just_metadata: whether to only process metadata, not the - actual data associated with the accession - :param str | int max_size: argument for prefetch command's --max-size option + Args: + input_arg: Input argument (GSE, or file). + metadata_folder: Path to folder for accession metadata. + just_metadata: Whether to only process metadata, not the + actual data associated with the accession. + max_size: Argument for prefetch command's --max-size option. """ acc_GSE_list = {} @@ -133,12 +143,15 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N return acc_GSE_list -def parse_SOFT_line(line: str) -> dict: +def parse_SOFT_line(line: str) -> dict[str, str]: """ Parse SOFT formatted line, returning a dictionary with the key-value pair. - :param str line: A SOFT-formatted line to parse ( !key = value ) - :return dict[str, str]: A python Dict object representing the key-value. + Args: + line: A SOFT-formatted line to parse ( !key = value ). + + Returns: + A python Dict object representing the key-value. 
""" elems = line[1:].split("=") return {elems[0].rstrip(): "=".join(elems[1:]).lstrip()} @@ -147,12 +160,13 @@ def parse_SOFT_line(line: str) -> dict: class AccessionException(Exception): """Exceptional condition(s) dealing with accession number(s).""" - def __init__(self, reason: str = ""): + def __init__(self, reason: str = "") -> None: """ Optionally provide explanation for exceptional condition. - :param str reason: some context or perhaps just a value that - could not be interpreted as an accession + Args: + reason: Some context or perhaps just a value that + could not be interpreted as an accession. """ super(AccessionException, self).__init__(reason) @@ -160,58 +174,66 @@ def __init__(self, reason: str = ""): class SoftFileException(Exception): """Exceptional condition(s) dealing with accession number(s).""" - def __init__(self, reason: str = ""): + def __init__(self, reason: str = "") -> None: """ Optionally provide explanation for exceptional condition. - :param str reason: some context or perhaps just a value that - could not be interpreted as an accession + Args: + reason: Some context or perhaps just a value that + could not be interpreted as an accession. """ super(SoftFileException, self).__init__(reason) -class Accession(object): +class Accession: """Working with accession numbers.""" _LOGGER = logging.getLogger("{}.{}".format(__name__, "Accession")) - def __init__(self, accn, strict=True): + def __init__(self, accn: str, strict: bool = True) -> None: """ Create an instance with an accession and optionally a validation strictness flag. - :param str accn: accession - :param bool strict: strictness of the validation (whether to require - that the accession type is known here) - :raise AccessionException: if the given accession value isn't - prefixed with three characters followed by an integer, or if - strict validation is required and the accession type is unknown + Args: + accn: Accession. 
+ strict: Strictness of the validation (whether to require + that the accession type is known here). + + Raises: + AccessionException: If the given accession value isn't + prefixed with three characters followed by an integer, or if + strict validation is required and the accession type is unknown. """ typename, number = self._validate(accn) if strict and not is_known_type(accn): raise AccessionException( - "Unknown accession type for '{}': '{}'; " - "supported types: {}".format(accn, typename, URL_BY_ACC.keys()) + "Unknown accession type for '{}': '{}'; supported types: {}".format( + accn, typename, URL_BY_ACC.keys() + ) ) self.accn = accn self.typename = typename.upper() def fetch_metadata( self, - outpath: str = None, - typename: str = None, + outpath: str | None = None, + typename: str | None = None, clean: bool = False, max_soft_size: int = 1073741824, - ) -> list: + ) -> list[str]: """ Fetch the metadata associated with this accession. - :param str typename: type indicating URL format, use type - parsed at construction if unspecified - :param str outpath: path to file to which to write output, optional - :param bool clean: if true, files won't be saved - :param int max_soft_size: max soft file size in bytes - :return: list of lines in soft file + Args: + outpath: Path to file to which to write output, optional. + typename: Type indicating URL format, use type + parsed at construction if unspecified. + clean: If true, files won't be saved. + max_soft_size: Max soft file size in bytes. + + Returns: + List of lines in soft file. """ typename = (typename or self.typename).upper() @@ -282,11 +304,15 @@ def fetch_metadata( return result_list @staticmethod - def _validate(accn: str): + def _validate(accn: str) -> tuple[str, int]: """ Determine if given value looks like an accession. - :param str accn: ordinary accession identifier. - :return: typename, number + + Args: + accn: Ordinary accession identifier. + + Returns: + Typename and number. 
""" typename, number = split_accn(accn) if len(typename) != 3: @@ -304,17 +330,22 @@ def _validate(accn: str): return typename, number @staticmethod - def accn_type_exception(accn: str, typename: str, include_known: bool = True): + def accn_type_exception( + accn: str, typename: str, include_known: bool = True + ) -> AccessionException: """ Create an exception instance based on an accession and a parsed unknown typename. - :param str accn: accession identifier from which unknown typename - was parsed - :param str typename: unknown typename that was parsed - :param bool include_known: whether to include the known - typenames in the exception message - :return AccessionException: the exception instance + Args: + accn: Accession identifier from which unknown typename + was parsed. + typename: Unknown typename that was parsed. + include_known: Whether to include the known + typenames in the exception message. + + Returns: + The exception instance. """ message = "Unknown accn type for '{}': '{}'".format(accn, typename) if include_known: @@ -322,13 +353,16 @@ def accn_type_exception(accn: str, typename: str, include_known: bool = True): return AccessionException(message) -def split_accn(accn: str): +def split_accn(accn: str) -> tuple[str, str]: """ Split accession into prefix and number, leaving suffix as text and converting the type prefix to uppercase. - :param str accn: ordinary accession identifier. - :return str, str: prefix and integral suffix + Args: + accn: Ordinary accession identifier. + + Returns: + Prefix and integral suffix. """ typename, number_text = accn[:3], accn[3:] return typename.upper(), number_text @@ -336,9 +370,13 @@ def split_accn(accn: str): def convert_size(size_str: str) -> int: """ - Converting size, that was provided as string with suffix - :param str size_str: size as string with suffix: gb, mb, kb or b - :return int: size as int value in bytes + Convert size provided as string with suffix to bytes. 
+ + Args: + size_str: Size as string with suffix: gb, mb, kb or b. + + Returns: + Size as int value in bytes. """ abbreviation_dict = {"gb": 1073741824, "mb": 1048576, "kb": 1024, "b": 1} supported_formats = r"(\dgb|\dmb|\db|\dkb)$" @@ -361,11 +399,12 @@ def convert_size(size_str: str) -> int: return size_in_bytes -def clean_soft_files(meta_dir: str): +def clean_soft_files(meta_dir: str) -> None: """ - Cleaning, deleting all soft files after downloading files - and creating PEPs - :param str meta_dir: Path to the metadata files + Delete all soft files after downloading files and creating PEPs. + + Args: + meta_dir: Path to the metadata files. """ try: dir_files = os.listdir(meta_dir) @@ -382,8 +421,8 @@ def clean_soft_files(meta_dir: str): _LOGGER.debug("Can't clean soft files...folder doesn't exist") -def run_subprocess(*args, **kwargs): - """Wrapper to gracefully start and stop a running subprocess""" +def run_subprocess(*args, **kwargs) -> int: + """Wrapper to gracefully start and stop a running subprocess.""" p = subprocess.Popen(*args, **kwargs) try: return p.wait() @@ -397,12 +436,15 @@ def run_subprocess(*args, **kwargs): sys.exit(1) -def _get_list_of_keys(list_of_dict: list): +def _get_list_of_keys(list_of_dict: list[dict]) -> list[str]: """ - Getting list of all keys that are in the dictionaries in the list + Get list of all keys that are in the dictionaries in the list. + + Args: + list_of_dict: List of dicts with metadata. - :param list list_of_dict: list of dicts with metadata - :return list: list of dictionary keys + Returns: + List of dictionary keys. """ dict_keys = {"sample_name": None} @@ -414,20 +456,29 @@ def _get_list_of_keys(list_of_dict: list): return list(dict_keys.keys()) -def _get_value(all_line: str): +def _get_value(all_line: str) -> str: """ - :param all_line: string with key value. (e.g. '!Series_geo_accession = GSE188720') - :return: value (e.g. GSE188720) + Extract value from a key-value string. 
+ + Args: + all_line: String with key value (e.g. '!Series_geo_accession = GSE188720'). + + Returns: + The extracted value (e.g. GSE188720). """ line_value = all_line.split("= ")[-1] return line_value.split(": ")[-1].rstrip("\n") -def _read_tar_filelist(raw_text: str) -> dict: +def _read_tar_filelist(raw_text: str) -> dict[str, dict]: """ - Creating list for supplementary files that are listed in "filelist.txt" - :param str raw_text: path to the file with information about files that are zipped ("filelist.txt") - :return dict: dict of supplementary file names and additional information + Create list for supplementary files that are listed in "filelist.txt". + + Args: + raw_text: Raw text content of the filelist. + + Returns: + Dict of supplementary file names and additional information. """ f = StringIO(raw_text) files_info = {} @@ -449,11 +500,15 @@ def _read_tar_filelist(raw_text: str) -> dict: return files_info -def _check_file_existance(meta_processed_sample: list) -> list: +def _check_file_existance(meta_processed_sample: list[dict]) -> list[dict]: """ - Checking if last element of the list has files. If list of files is empty deleting it - :param: meta_processed_sample: list with metadata dictionary - :return: list with metadata dictionary after processing + Check if last element of the list has files; delete it if empty. + + Args: + meta_processed_sample: List with metadata dictionaries. + + Returns: + List with metadata dictionaries after processing. 
""" nb = len(meta_processed_sample) - 1 if nb > -1: @@ -463,12 +518,15 @@ def _check_file_existance(meta_processed_sample: list) -> list: return meta_processed_sample -def _separate_list_of_files(meta_list: Union[list, dict], col_name: str = "files"): +def _separate_list_of_files( + meta_list: list | dict, col_name: str = "files" +) -> list[dict]: """ - This method is separating list of files (dict value) or just simple dict - into two different dicts - :param col_name: column name that should be added with filenames - :param meta_list: list, or dict with metadata + Separate list of files (dict value) into individual dicts. + + Args: + meta_list: List or dict with metadata. + col_name: Column name that should be added with filenames. """ separated_list = [] if isinstance(meta_list, list): @@ -500,15 +558,16 @@ def _update_columns( including new data and populating columns used by looper based on existing values in the mapping. - :param Mapping metadata: the key-value mapping to update - :param str experiment_name: name of the experiment from which these - data came and are associated; the key in the metadata mapping - for which the value is to be updated - :param str sample_name: name of the sample with which these data are - associated - :param str read_type: usually "single" or "paired," an indication of the - type of sequencing reads for this experiment - :return: updated metadata + Args: + metadata: The key-value mapping to update. + experiment_name: Name of the experiment; the key in the metadata + mapping for which the value is to be updated. + sample_name: Name of the sample with which these data are associated. + read_type: Usually "single" or "paired," an indication of the + type of sequencing reads for this experiment. + + Returns: + Updated metadata. """ exp = metadata[experiment_name] @@ -538,11 +597,7 @@ def _update_columns( def _sanitize_config_string(text: str) -> str: - """ - Function that sanitizes text in config file. 
- :param text: Any string that have to be sanitized - :return: sanitized strings - """ + """Sanitize text for use in config files.""" new_str = text new_str = new_str.replace('"', '\\"') new_str = new_str.replace("'", "''") @@ -550,11 +605,7 @@ def _sanitize_config_string(text: str) -> str: def _sanitize_name(name_str: str) -> str: - """ - Function that sanitizes strings. (Replace all odd characters) - :param str name_str: Any string value that has to be sanitized. - :return: sanitized strings - """ + """Sanitize a string by replacing special characters with underscores.""" new_str = name_str punctuation1 = r"""!"#$%&'()*,./:;<=>?@[\]^_`{|}~""" for odd_char in list(punctuation1): @@ -563,20 +614,20 @@ def _sanitize_name(name_str: str) -> str: return new_str -def _create_dot_yaml(file_path: str, yaml_path: str) -> NoReturn: +def _create_dot_yaml(file_path: str, yaml_path: str) -> None: """ - Function that creates .pep.yaml file that points to actual yaml file - :param str file_path: Path to the .pep.yaml file that we want to create - :param str yaml_path: path or name of the actual yaml file + Create a .pep.yaml file that points to the actual yaml file. + + Args: + file_path: Path to the .pep.yaml file to create. + yaml_path: Path or name of the actual yaml file. 
""" with open(file_path, "w+") as file: file.writelines(f"config_file: {yaml_path}") -def _which(program: str): - """ - return str: the path to a program to make sure it exists - """ +def _which(program: str) -> str | None: + """Return the path to a program, or None if not found.""" import os def is_exe(fp): @@ -595,15 +646,17 @@ def is_exe(fp): def _dict_to_list_converter( - proj_dict: Dict = None, proj_list: List = None -) -> Union[Dict, List]: + proj_dict: dict | None = None, proj_list: list | None = None +) -> dict | list: """ - Converter project dict to list and vice versa - dict -> list - list -> dict - :param proj_dict: project dictionary - :param proj_list: project list - :return: converted values + Convert project dict to list and vice versa. + + Args: + proj_dict: Project dictionary. + proj_list: Project list. + + Returns: + Converted values. """ if proj_dict is not None: new_meta_list = [] @@ -626,11 +679,15 @@ def _dict_to_list_converter( return meta_list -def _standardize_colnames(meta_list: Union[list, dict]) -> Union[list, dict]: +def _standardize_colnames(meta_list: list | dict) -> list | dict: """ - Standardize column names by lower-casing and underscore - :param list meta_list: list of dictionaries of samples - :return : list of dictionaries of samples with standard colnames + Standardize column names by lower-casing and underscore. + + Args: + meta_list: List of dictionaries of samples. + + Returns: + List of dictionaries of samples with standardized column names. 
""" # check if meta_list is dict and converting it to list input_is_dict = False @@ -658,10 +715,8 @@ def _standardize_colnames(meta_list: Union[list, dict]) -> Union[list, dict]: return new_metalist -def _separate_file_url(meta_list): - """ - This method is adding dict key without file_name without path - """ +def _separate_file_url(meta_list: list[dict]) -> list[dict]: + """Add file_url key and set file to basename only.""" separated_list = [] for meta_elem in meta_list: new_dict = meta_elem.copy() @@ -684,11 +739,9 @@ def _separate_file_url(meta_list): def make_sample_name_unique( - sanit_name: str, separated_list: list, new_number: int = 1 + sanit_name: str, separated_list: list[dict], new_number: int = 1 ) -> str: - """ - Check if name is unique for current sample - """ + """Check if name is unique for current sample, appending a number if not.""" if sanit_name not in [f["sample_name"] for f in separated_list]: return sanit_name elif f"{sanit_name}_{new_number}" not in [f["sample_name"] for f in separated_list]: @@ -697,15 +750,13 @@ def make_sample_name_unique( return make_sample_name_unique(sanit_name, separated_list, new_number + 1) -def _filter_gsm(meta_processed_samples: list, gsm_list: dict) -> list: +def _filter_gsm(meta_processed_samples: list[dict], gsm_list: dict) -> list[dict]: """ - Getting metadata list of all samples of one experiment and filtering it - by the list of GSM that was specified in the input files. - And then changing names of the sample names. + Filter metadata samples by a GSM list from the input file. - :param meta_processed_samples: list of metadata dicts of samples - :param gsm_list: list of dicts where GSM (samples) are keys and - sample names are values. Where values can be empty string + Args: + meta_processed_samples: List of metadata dicts of samples. + gsm_list: Dict where GSM accessions are keys and sample names are values. 
""" if gsm_list.keys(): @@ -722,13 +773,15 @@ def _filter_gsm(meta_processed_samples: list, gsm_list: dict) -> list: return meta_processed_samples -def _unify_list_keys(processed_meta_list: list) -> list: +def _unify_list_keys(processed_meta_list: list[dict]) -> list[dict]: """ - Unifying list of dicts with metadata, so every dict will have - same keys + Unify list of dicts so every dict has the same keys. + + Args: + processed_meta_list: List of dicts with metadata. - :param list processed_meta_list: list of dicts with metadata - :return list: list of unified dicts with metadata + Returns: + List of unified dicts with metadata. """ list_of_keys = _get_list_of_keys(processed_meta_list) for k in list_of_keys: @@ -738,11 +791,15 @@ def _unify_list_keys(processed_meta_list: list) -> list: return processed_meta_list -def gse_content_to_dict(gse_content: List[str]) -> Dict[str, dict]: +def gse_content_to_dict(gse_content: list[str]) -> dict[str, dict]: """ - Unpack gse soft file to dict - :param gse_content: list of strings of gse soft file - :return: dict of gse content + Unpack GSE soft file content to a dict. + + Args: + gse_content: List of strings of GSE soft file. + + Returns: + Dict of GSE content. """ gse_dict = {} for line in gse_content: @@ -761,11 +818,7 @@ def gse_content_to_dict(gse_content: List[str]) -> Dict[str, dict]: def is_prefetch_callable() -> bool: - """ - Test if the prefetch command can be run. - - :return: True if it is available. - """ + """Test if the prefetch command can be run.""" try: # Option -V means display version and then quit. subprocess.run( diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ce2e564 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,68 @@ +[project] +name = "geofetch" +version = "0.12.11" +description = "Downloads data and metadata from GEO and SRA and creates standard PEPs." 
+readme = "README.md" +license = "BSD-2-Clause" +requires-python = ">=3.10" +authors = [ + { name = "Oleksandr Khoroshevskyi" }, + { name = "Nathan Sheffield" }, + { name = "Vince Reuter" }, + { name = "Nathan LeRoy" }, +] +keywords = ["project", "bioinformatics", "sequencing", "ngs", "workflow"] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "colorama>=0.3.9", + "logmuse>=0.3.0", + "ubiquerg>=0.6.2", + "requests>=2.28.1", + "xmltodict>=0.13.0", + "pandas>=1.5.3", + "peppy>=0.40.6", + "rich>=12.5.1", + "coloredlogs>=15.0.1", + "piper>=0.14.4", +] + +[project.urls] +Homepage = "https://github.com/pepkit/geofetch/" + +[project.scripts] +geofetch = "geofetch.__main__:main" +sraconvert = "geofetch.sraconvert:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["geofetch/**"] + +[project.optional-dependencies] +test = [ + "pytest", +] + +[tool.pytest.ini_options] +addopts = "-rfE" +testpaths = ["tests"] + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "I"] +ignore = ["F403", "F405", "E501"] + +[tool.ruff.lint.isort] +known-first-party = ["geofetch"] diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index 9440999..0000000 --- a/requirements/requirements-all.txt +++ /dev/null @@ -1,10 +0,0 @@ -colorama>=0.3.9 -logmuse>=0.2.6 -ubiquerg>=0.6.2 -requests>=2.28.1 -xmltodict>=0.13.0 -pandas>=1.5.3 -peppy>=0.40.6 -rich>=12.5.1 -coloredlogs>=15.0.1 -piper>=0.14.4 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt deleted file mode 100644 index e69de29..0000000 diff --git 
a/requirements/requirements-docs.txt b/requirements/requirements-docs.txt deleted file mode 100644 index f2b38f7..0000000 --- a/requirements/requirements-docs.txt +++ /dev/null @@ -1,2 +0,0 @@ -geofetch -https://github.com/databio/mkdocs-databio/archive/master.zip \ No newline at end of file diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt deleted file mode 100644 index aecaff0..0000000 --- a/requirements/requirements-test.txt +++ /dev/null @@ -1,4 +0,0 @@ -black -pytest -coveralls -pytest-cov diff --git a/setup.py b/setup.py deleted file mode 100644 index 4cd56f2..0000000 --- a/setup.py +++ /dev/null @@ -1,76 +0,0 @@ -#! /usr/bin/env python - -import os -import sys - -from setuptools import setup - -PACKAGE = "geofetch" -REQDIR = "requirements" - -# Additional keyword arguments for setup(). -extra = {} - -# Ordinary dependencies - - -def read_reqs(reqs_name): - deps = [] - with open(os.path.join(REQDIR, "requirements-{}.txt".format(reqs_name)), "r") as f: - for line in f: - if not line.strip(): - continue - # deps.append(line.split("=")[0].rstrip("<>")) - deps.append(line) - return deps - - -DEPENDENCIES = read_reqs("all") -extra["install_requires"] = DEPENDENCIES - -scripts = None - -with open("{}/_version.py".format(PACKAGE), "r") as versionfile: - version = versionfile.readline().split()[-1].strip("\"'\n") - -with open("README.md") as f: - long_description = f.read() - -setup( - name=PACKAGE, - packages=[PACKAGE], - version=version, - description="Downloads data and metadata from GEO and SRA and creates standard PEPs.", - long_description=long_description, - long_description_content_type="text/markdown", - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - 
"Programming Language :: Python :: 3.13", - "Topic :: Scientific/Engineering :: Bio-Informatics", - ], - keywords="project, bioinformatics, sequencing, ngs, workflow, GUI", - url="https://github.com/pepkit/{}/".format(PACKAGE), - author="Oleksandr Khoroshevskyi, Nathan Sheffield, Vince Reuter, Nathan LeRoy", - license="BSD2", - entry_points={ - "console_scripts": [ - "geofetch = geofetch.__main__:main", - "sraconvert = geofetch.sraconvert:main", - ], - }, - package_data={PACKAGE: ["templates/*"]}, - scripts=scripts, - include_package_data=True, - test_suite="tests", - tests_require=read_reqs("dev"), - setup_requires=( - ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] - ), - **extra, -)