Coverage for seqbank/dfam.py: 100.00%
39 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-02 04:29 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-02 04:29 +0000
1import gzip
2from pathlib import Path
3import h5py
4import tempfile
5import shutil
7from .io import download_file
8from .seqbank import SeqBank
def dfam_url(curated: bool = True, release: str = "current") -> str:
    """
    Builds the download URL of the gzipped Dfam HDF5 families file.

    Args:
        curated (bool, optional): When truthy, the URL points at the curated-only file
            (``Dfam_curatedonly.h5.gz``); otherwise at the full file (``Dfam.h5.gz``).
            Defaults to True.
        release (str, optional): The release directory on the Dfam server, inserted
            verbatim into the URL. Defaults to "current".

    Returns:
        str: The URL of the requested Dfam HDF5 archive.
    """
    # Only the curated-only file carries an extra tag in its name.
    if curated:
        curated_str = "_curatedonly"
    else:
        curated_str = ""

    return f"https://www.dfam.org/releases/{release}/families/Dfam{curated_str}.h5.gz"
def add_dfam(seqbank: SeqBank, local_path: Path) -> None:
    """
    Adds sequences from a local Dfam HDF5 file to the SeqBank.

    Every dataset visited under the ``Families/DF`` group contributes one sequence:
    its ``accession`` and ``consensus`` attributes are passed to ``seqbank.add``.
    The HDF5 file is opened read-only and is always closed before returning, even
    if visiting a dataset or adding a sequence raises.

    Args:
        seqbank (SeqBank): The SeqBank instance to add sequences to.
        local_path (Path): The path to the local (already decompressed) Dfam HDF5 file.
    """

    def visitor_func(name, node):
        # Groups are visited too; only datasets carry the family attributes.
        # The callback returns None implicitly so that h5py keeps visiting.
        if isinstance(node, h5py.Dataset):
            accession = node.attrs["accession"]
            seq = node.attrs["consensus"]
            seqbank.add(seq=seq, accession=accession)

    # Use a context manager so the file handle is released deterministically
    # instead of being left open until garbage collection.
    with h5py.File(local_path, "r") as file:
        file["Families/DF"].visititems(visitor_func)
def download_dfam(seqbank: SeqBank, curated: bool = True, release: str = "current", force: bool = False) -> bool:
    """
    Fetches a Dfam release, unpacks it and loads its sequences into the SeqBank.

    The gzipped HDF5 file is downloaded into a temporary directory, decompressed
    alongside the archive, handed to ``add_dfam`` and, once that succeeds, the URL is
    recorded with ``seqbank.save_seen_url``. The temporary directory is removed when
    the function returns.

    Args:
        seqbank (SeqBank): The SeqBank instance that receives the Dfam sequences.
        curated (bool, optional): If True, uses the curated-only Dfam file. Defaults to True.
        release (str, optional): The Dfam release version. Defaults to "current".
        force (bool, optional): If True, downloads and adds the file even when the URL
            is already recorded in the SeqBank. Defaults to False.

    Returns:
        bool: True if the file was downloaded and its sequences were added. False if
        the URL was already recorded (and ``force`` is not set) or if any step of the
        download, decompression or import raised an exception.
    """
    url = dfam_url(curated, release)

    # Skip the work when this URL has been recorded before, unless forced.
    already_seen = seqbank.key_url(url) in seqbank.file
    if already_seen and not force:
        print(f"Already downloaded: {url}")
        return False

    with tempfile.TemporaryDirectory() as tmpdirname:
        archive_path = Path(tmpdirname, Path(url).name)
        try:
            print(f"Downloading Dfam: {url}")
            download_file(url, archive_path)

            # Dropping the trailing ".gz" gives the name of the decompressed file.
            hdf5_path = archive_path.with_suffix("")
            with gzip.open(archive_path, "rb") as compressed:
                with open(hdf5_path, "wb") as expanded:
                    shutil.copyfileobj(compressed, expanded)

            add_dfam(seqbank, hdf5_path)

            # Record the URL only after the sequences were added successfully.
            seqbank.save_seen_url(url)
        except Exception as err:
            # Best-effort: report the failure and signal it through the return value.
            print(f"Failed to add Dfam: {url}: {err}")
            return False

    return True