Coverage for seqbank/refseq.py: 100.00%
14 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-02 04:29 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-02 04:29 +0000
1from pathlib import Path
2import re
3from .io import download_file, TemporaryDirectory
def get_refseq_filenames(tmp_dir: str | Path | None = None) -> list[str]:
    """
    Retrieves a list of RefSeq genomic filenames from the NCBI FTP site.

    The HTML directory listing of the RefSeq ``release/complete`` folder is downloaded
    into a temporary directory and scanned for link texts ending in ``.genomic.fna.gz``.

    Args:
        tmp_dir (str | Path | None, optional):
            Forwarded as ``prefix`` to ``TemporaryDirectory``. Intended to be the directory
            to create a temporary folder in. If None, a default temporary directory is used.

    Returns:
        list[str]: The matched filenames, sorted by the integer value of the second
            dot-separated field of each filename.

    Raises:
        ValueError: If the second dot-separated field of a matched filename is not an integer.
    """
    with TemporaryDirectory(prefix=tmp_dir) as dirname:
        local_path = dirname / "refseq_complete.html"

        download_file("https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/", local_path)
        text = local_path.read_text()

    # Capture only the link text of anchors whose text ends in ".genomic.fna.gz".
    # The dots are escaped so they match literal dots, and "[^<>]*" keeps the
    # capture inside a single tag body so it cannot swallow preceding markup.
    filenames = re.findall(r">([^<>]*\.genomic\.fna\.gz)</a>", text)

    # Sort numerically (not lexically) by the second dot-separated field so that
    # e.g. "x.10.y" comes after "x.2.y". The sort is stable, so files sharing
    # that number keep the order in which they appear in the listing.
    return sorted(filenames, key=lambda filename: int(filename.split(".")[1]))
def get_refseq_urls(tmp_dir: str | Path | None = None) -> list[str]:
    """
    Builds the download URLs for the RefSeq genomic files on the NCBI FTP site.

    Args:
        tmp_dir (str | Path | None, optional):
            Passed through unchanged to ``get_refseq_filenames``.

    Returns:
        list[str]: One URL per filename returned by ``get_refseq_filenames``, in the same order.
    """
    base_url = "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/"

    urls: list[str] = []
    for name in get_refseq_filenames(tmp_dir=tmp_dir):
        # Each filename is appended directly to the listing's base URL.
        urls.append(f"{base_url}{name}")
    return urls