Coverage for seqbank/dfam.py: 100.00%

39 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-12-02 04:29 +0000

1import gzip 

2from pathlib import Path 

3import h5py 

4import tempfile 

5import shutil 

6 

7from .io import download_file 

8from .seqbank import SeqBank 

9 

10 

def dfam_url(curated: bool = True, release: str = "current") -> str:
    """
    Builds the download URL of the gzipped Dfam HDF5 families file.

    Args:
        curated (bool, optional): When truthy, the file name carries the ``_curatedonly`` suffix; otherwise no suffix is added. Defaults to True.
        release (str, optional): The release name inserted into the URL path. Defaults to "current".

    Returns:
        str: The URL of the ``.h5.gz`` file for the requested release.
    """
    if curated:
        variant = "_curatedonly"
    else:
        variant = ""
    return f"https://www.dfam.org/releases/{release}/families/Dfam{variant}.h5.gz"

25 

26 

def add_dfam(seqbank: SeqBank, local_path: Path) -> None:
    """
    Adds sequences from a local Dfam HDF5 file to the SeqBank.

    Every dataset found beneath the ``Families/DF`` group is visited, and its
    ``accession`` and ``consensus`` attributes are handed to ``seqbank.add``.
    The HDF5 file is opened read-only and is closed again before returning,
    including when an error is raised part-way through.

    Args:
        seqbank (SeqBank): The SeqBank instance to add sequences to.
        local_path (Path): The path to the local (uncompressed) Dfam HDF5 file.

    Raises:
        KeyError: If the ``Families/DF`` group is missing, or a dataset lacks the
            ``accession`` or ``consensus`` attribute.
    """

    def visitor_func(name, node):
        # Groups are visited too; only datasets are read for sequence attributes.
        if isinstance(node, h5py.Dataset):
            accession = node.attrs["accession"]
            seq = node.attrs["consensus"]
            seqbank.add(seq=seq, accession=accession)

    # Use a context manager so the file handle is always released. Callers such as
    # download_dfam delete the file right after this function returns, and a
    # lingering open handle can leak or block that cleanup.
    with h5py.File(local_path, "r") as file:
        file["Families/DF"].visititems(visitor_func)

44 

45 

def download_dfam(seqbank: SeqBank, curated: bool = True, release: str = "current", force: bool = False) -> bool:
    """
    Fetches the gzipped Dfam HDF5 file, unpacks it, and loads its sequences into the SeqBank.

    The archive is downloaded into a temporary directory, decompressed next to
    itself, passed to ``add_dfam``, and finally the URL is recorded with
    ``seqbank.save_seen_url``. The temporary directory is removed afterwards.

    Args:
        seqbank (SeqBank): The SeqBank instance that receives the Dfam sequences.
        curated (bool, optional): Passed to ``dfam_url`` to choose the curated-only file. Defaults to True.
        release (str, optional): Passed to ``dfam_url`` as the Dfam release name. Defaults to "current".
        force (bool, optional): If True, the download runs even when the URL's key is already present in ``seqbank.file``. Defaults to False.

    Returns:
        bool: True when the file was downloaded and added. False when the URL was
        already recorded (and ``force`` is False) or when any exception occurred
        during download, decompression, or loading; the failure is printed, not raised.
    """
    url = dfam_url(curated, release)

    # Skip the work when this URL has been recorded before, unless forced.
    already_recorded = seqbank.key_url(url) in seqbank.file
    if already_recorded and not force:
        print(f"Already downloaded: {url}")
        return False

    with tempfile.TemporaryDirectory() as workdir:
        archive_path = Path(workdir) / Path(url).name
        try:
            print(f"Downloading Dfam: {url}")
            download_file(url, archive_path)

            # Dropping the final ".gz" suffix gives the name of the unpacked file.
            unpacked_path = archive_path.with_suffix("")
            with gzip.open(archive_path, "rb") as compressed:
                with open(unpacked_path, "wb") as unpacked:
                    shutil.copyfileobj(compressed, unpacked)

            # Load the sequences, then remember that this URL has been handled.
            add_dfam(seqbank, unpacked_path)
            seqbank.save_seen_url(url)
        except Exception as err:
            print(f"Failed to add Dfam: {url}: {err}")
            return False

    return True