Coverage for seqbank/transform.py: 100.00%
22 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-02 04:29 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-02 04:29 +0000
1import numpy as np
# Integer code assigned to each nucleotide symbol; "N" (unknown base) is 0.
vocab_to_int = {"A": 1, "C": 2, "G": 3, "T": 4, "N": 0}
# Inverse mapping: integer code -> upper-case nucleotide symbol.
int_to_vocab = {code: base for base, code in vocab_to_int.items()}

# Every byte value that is NOT one of the recognised symbols (either case).
# Passed as the ``delete`` argument of ``bytes.translate`` to strip such bytes.
DELETE = bytes(b for b in range(256) if b not in b"ACGTNacgtn")

# 256-entry translation tables for ``bytes.translate``; entries that are never
# assigned below stay 0.
#   TABLE:         ASCII symbol (upper or lower case) -> integer code
#   REVERSE_TABLE: integer code -> upper-case ASCII symbol
TABLE = bytearray(256)
REVERSE_TABLE = bytearray(256)

for char, value in vocab_to_int.items():
    # Both cases of a symbol share the same code.
    TABLE[ord(char)] = TABLE[ord(char.lower())] = value
    REVERSE_TABLE[value] = ord(char)
def seq_to_bytes(seq: str | bytes, delete: bool = True) -> bytes:
    """
    Convert a DNA sequence to a byte representation using a predefined mapping.

    Every symbol is mapped through ``TABLE``: 'A' -> 1, 'C' -> 2, 'G' -> 3,
    'T' -> 4 and 'N' -> 0, case-insensitively.

    Args:
        seq (str | bytes): The DNA sequence to convert. Can be a string or bytes.
        delete (bool, optional): If True, delete characters that are not in
            'ACGTN' or 'acgtn'. If False, such characters are kept and encoded
            as 0 (the same code as 'N'). Defaults to True.

    Returns:
        bytes: The byte-encoded version of the input sequence.

    Raises:
        AssertionError: If ``seq`` is neither ``str`` nor ``bytes``.
    """
    if isinstance(seq, str):
        # A non-ASCII character can never be a valid base, so treat it like any
        # other unrecognised character instead of raising UnicodeEncodeError:
        # drop it when deleting, otherwise substitute "?" (which TABLE maps to
        # 0) so the output keeps one byte per input character.
        seq = seq.encode("ascii", errors="ignore" if delete else "replace")

    assert isinstance(seq, bytes), f"Expected bytes, got {type(seq)}"

    if delete:
        # Second argument lists the byte values to strip before translating.
        return seq.translate(TABLE, DELETE)
    return seq.translate(TABLE)
def seq_to_numpy(seq: str | bytes, delete: bool = True) -> np.ndarray:
    """
    Encode a DNA sequence as a one-dimensional NumPy array of uint8 codes.

    The sequence is first encoded with ``seq_to_bytes`` and the resulting
    buffer is then wrapped without copying, so the returned array is read-only.

    Args:
        seq (str | bytes): The DNA sequence to encode, as a string or bytes.
        delete (bool, optional): Forwarded to ``seq_to_bytes``; if True,
            characters outside 'ACGTN' (either case) are removed. Defaults to True.

    Returns:
        np.ndarray: Array of dtype uint8 holding one code per retained symbol.
    """
    return np.frombuffer(seq_to_bytes(seq, delete=delete), dtype=np.uint8)
def bytes_to_str(seq: bytes) -> str:
    """
    Decode a byte-encoded DNA sequence into its upper-case string form.

    Codes 0-4 are mapped through ``REVERSE_TABLE`` to 'N', 'A', 'C', 'G' and
    'T'. Any other byte value has no entry in the table and comes out as a NUL
    character. Letter case and any characters dropped during encoding cannot
    be recovered.

    Args:
        seq (bytes): The byte-encoded DNA sequence.

    Returns:
        str: The decoded DNA sequence.
    """
    symbols = seq.translate(REVERSE_TABLE)
    return str(symbols, "ascii")