Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import numpy as np 

2import torch 

3from fastai.torch_core import TensorBase 

4 

# Integer code assigned to each base character; "N" shares code 0.
vocab_to_int = {"A": 1, "C": 2, "G": 3, "T": 4, "N": 0}

# Reverse lookup: integer code back to its base character.
int_to_vocab = {code: base for base, code in vocab_to_int.items()}

7 

8 

class TensorDNA(TensorBase):
    """
    Tensor of integer-encoded DNA bases that can render itself as text.

    Values are decoded with the module-level ``int_to_vocab`` table; any
    value missing from that table is rendered as its ``str()`` form.
    """

    def __str__(self):
        """Return the (possibly abbreviated) sequence followed by its length in brackets."""
        # Report the number of encoded tokens rather than the length of the
        # display string, which stops growing once the sequence is abbreviated.
        return f"{self.as_chars()} [{len(self._tokens())}]"

    def show(self, ctx=None, **kwargs):
        """
        Return the string form of this tensor.

        ``ctx`` and any extra keyword arguments are accepted but not used.
        """
        return str(self)

    def _tokens(self):
        """Return the tensor's values as a Python list, wrapping a lone scalar in a list."""
        items = self.tolist()
        # ``tolist`` on a zero-dimensional tensor yields a bare scalar.
        if not isinstance(items, list):
            items = [items]
        return items

    def as_chars(self, truncate_at=50):
        """
        Decode the tensor into a string of base characters.

        Args:
            truncate_at: Longest sequence (in tokens, non-negative) that is
                shown in full. Longer sequences keep ``truncate_at // 2``
                tokens from each end with ``".."`` in between. Pass ``None``
                to always decode the whole sequence.

        Returns:
            str: The decoded sequence.
        """
        items = self._tokens()
        length = len(items)

        if truncate_at is not None and length > truncate_at:
            midpoint = truncate_at // 2
            # Slice the tail from an explicit start index: ``items[-0:]``
            # would return the whole list when ``midpoint`` is zero.
            items = items[:midpoint] + [".."] + items[length - midpoint:]

        return "".join(int_to_vocab.get(x, str(x)) for x in items)

    def as_biopython(self):
        """
        Return the sequence as a Biopython ``Seq``.

        The whole sequence is decoded; the display abbreviation used by
        ``as_chars`` is switched off so no bases are lost.
        """
        # Imported lazily so Biopython is only needed when this method is called.
        from Bio.Seq import Seq

        return Seq(self.as_chars(truncate_at=None))

35 

36 

def dna_seq_to_numpy(seq) -> np.ndarray:
    """
    Encode a sequence (e.g. one from biopython) as a uint8 numpy array.

    The sequence is rendered with ``str()`` and upper-cased. Characters found
    in ``vocab_to_int`` are replaced by their integer codes; every other
    character is dropped from the result.

    Open question from the original author: should this be a transform?
    """
    # One byte per character, reinterpreted as its numeric character code.
    codes = np.array(str(seq).upper(), "c").view(np.uint8)

    # Discard anything below 'A' (65) up front so that raw bytes with tiny
    # values cannot be mistaken for already-encoded tokens further down.
    codes = codes[codes >= ord("A")]

    for base, token in vocab_to_int.items():
        codes[codes == ord(base)] = token

    # Whatever is still at or above the vocabulary size was never mapped.
    recognised = codes < len(vocab_to_int)
    return np.array(codes[recognised], dtype="u1")

52 

53 

def dna_seq_to_tensor(seq) -> TensorDNA:
    """
    Encode a sequence (e.g. one from biopython) as a TensorDNA tensor.

    The sequence is first encoded by ``dna_seq_to_numpy`` and the resulting
    array is wrapped in ``TensorDNA``.

    Open questions from the original author: should this be a pipeline, and
    can the ToTensor transform in fastai be used instead?
    """
    return TensorDNA(dna_seq_to_numpy(seq))

62 return t