Coverage for seqbank/refseq.py: 100.00%

14 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-12-02 04:29 +0000

1from pathlib import Path 

2import re 

3from .io import download_file, TemporaryDirectory 

4 

5 

def get_refseq_filenames(tmp_dir: str | Path | None = None) -> list[str]:
    """
    Retrieves a list of RefSeq genomic filenames from the NCBI FTP site.

    The directory listing at ``https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/`` is
    downloaded to ``refseq_complete.html`` inside a temporary directory, and the link text of
    every anchor ending in ``.genomic.fna.gz`` is extracted from it.

    Args:
        tmp_dir (str | Path | None, optional):
            The directory to create a temporary folder in; it is forwarded as ``prefix`` to
            ``TemporaryDirectory``. If None, a default temporary directory is used.

    Returns:
        list[str]: The filenames, sorted numerically by the integer in their second
        dot-separated field. Filenames sharing that integer keep the order in which they
        appear in the listing (``sorted`` is stable).

    Raises:
        ValueError: If the second dot-separated field of a matched filename is not an integer.
    """
    with TemporaryDirectory(prefix=tmp_dir) as dirname:
        local_path = dirname / "refseq_complete.html"

        download_file("https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/", local_path)
        text = local_path.read_text()

    # Capture the link text of each anchor that ends in ".genomic.fna.gz".
    # The dots are escaped so they only match a literal ".", and "[^<>]" keeps the capture
    # inside one tag's text, so markup preceding the anchor on the same line is never
    # swallowed into the filename.
    filenames = re.findall(r">([^<>]*?\.genomic\.fna\.gz)</a>", text)

    # Sort filenames numerically by version number (second part of filename).
    return sorted(filenames, key=lambda filename: int(filename.split(".")[1]))

30 

31 

def get_refseq_urls(tmp_dir: str | Path | None = None) -> list[str]:
    """
    Builds the download URL of every RefSeq genomic file listed on the NCBI FTP site.

    Args:
        tmp_dir (str | Path | None, optional):
            Forwarded unchanged to ``get_refseq_filenames`` as its ``tmp_dir`` argument.
            Defaults to None.

    Returns:
        list[str]: One URL per filename, in the same order that ``get_refseq_filenames``
        returns the filenames.
    """
    urls = []
    for name in get_refseq_filenames(tmp_dir=tmp_dir):
        # Each filename is appended to the fixed "complete" release directory URL.
        urls.append(f"https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/{name}")
    return urls