Coverage for seqbank/transform.py: 100.00%

22 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-12-02 04:29 +0000

1import numpy as np 

2 

# Integer code assigned to each nucleotide symbol; "N" shares code 0 with the
# zero-initialised default entries of TABLE below.
vocab_to_int = {"A": 1, "C": 2, "G": 3, "T": 4, "N": 0}

# Inverse mapping: integer code -> nucleotide symbol.
int_to_vocab = {code: symbol for symbol, code in vocab_to_int.items()}

# All 256 byte values except the upper- and lower-case vocabulary symbols.
# Passed as the ``delete`` argument of ``bytes.translate`` to drop them.
DELETE = bytes(b for b in range(256) if b not in b"ACGTNacgtn")

# 256-entry translation tables, initially all zero bytes.
# TABLE:         ASCII symbol (either case) -> integer code.
# REVERSE_TABLE: integer code -> upper-case ASCII symbol.
TABLE = bytearray(256)
REVERSE_TABLE = bytearray(256)

for char, value in vocab_to_int.items():
    # Both cases of a symbol encode to the same integer code.
    TABLE[ord(char)] = TABLE[ord(char.lower())] = value

    # Decoding always yields the upper-case symbol.
    REVERSE_TABLE[value] = ord(char)

15 

16 

def seq_to_bytes(seq: str | bytes, delete: bool = True) -> bytes:
    """
    Encode a DNA sequence as bytes of integer codes using the module's ``TABLE``.

    Args:
        seq (str | bytes): The sequence to encode. A ``str`` is first encoded as ASCII.
        delete (bool, optional): If True, bytes listed in ``DELETE`` (everything outside
            'ACGTN' / 'acgtn') are dropped before translation. If False, every input byte
            is kept and translated through ``TABLE``. Defaults to True.

    Returns:
        bytes: The translated sequence, one byte per retained input byte.
    """
    raw = seq.encode("ascii") if isinstance(seq, str) else seq

    # Anything that is neither str nor bytes is rejected here.
    assert isinstance(raw, bytes), f"Expected bytes, got {type(seq)}"

    # An empty delete set is equivalent to calling translate() with the table alone.
    dropped = DELETE if delete else b""
    return raw.translate(TABLE, dropped)

36 

37 

def seq_to_numpy(seq: str | bytes, delete: bool = True) -> np.ndarray:
    """
    Encode a DNA sequence and expose the result as a NumPy array of unsigned bytes.

    Args:
        seq (str | bytes): The sequence to encode, as a string or as bytes.
        delete (bool, optional): Forwarded to ``seq_to_bytes``; if True, characters
            outside 'ACGTN' are removed. Defaults to True.

    Returns:
        np.ndarray: A 1-dimensional array with dtype ``"u1"`` built from the bytes
        returned by ``seq_to_bytes``.
    """
    return np.frombuffer(seq_to_bytes(seq, delete=delete), dtype="u1")

51 

52 

def bytes_to_str(seq: bytes) -> str:
    """
    Decode a byte-encoded DNA sequence back into its string form.

    Args:
        seq (bytes): The sequence as bytes of integer codes.

    Returns:
        str: The sequence with each code replaced by its symbol from ``REVERSE_TABLE``.
    """
    # Map each integer code back to its ASCII symbol, then decode to text.
    symbols = seq.translate(REVERSE_TABLE)
    return symbols.decode("ascii")