Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import numpy as np
2import torch
3from fastai.torch_core import TensorBase
# Integer code assigned to each nucleotide letter.
vocab_to_int = {
    "A": 1,
    "C": 2,
    "G": 3,
    "T": 4,
    "N": 0,
}
# Reverse lookup: integer code -> nucleotide letter.
int_to_vocab = {code: letter for letter, code in vocab_to_int.items()}
class TensorDNA(TensorBase):
    """
    Tensor of integer-encoded DNA bases that can be rendered as a nucleotide string.

    Elements are translated through the module-level ``int_to_vocab`` table;
    values with no entry in that table are rendered with ``str()``.
    """

    def __str__(self):
        """Return the (possibly abbreviated) sequence followed by its element count."""
        # Report the number of elements rather than the length of the rendered
        # text: once a long sequence is abbreviated, the text length is the same
        # for every input and says nothing about the data.
        return f"{self.as_chars()} [{self.numel()}]"

    def show(self, ctx=None, **kwargs):
        """Return the string form of this tensor; ``ctx`` and ``kwargs`` are not used."""
        return str(self)

    def as_chars(self, truncate_at=50):
        """
        Render the tensor as a string of nucleotide characters.

        Args:
            truncate_at: Maximum number of elements to render in full. Longer
                sequences keep their first and last ``truncate_at // 2``
                elements, joined by ``".."``. Pass ``None`` to always render
                the whole sequence. Expected to be non-negative.

        Returns:
            The rendered sequence as a ``str``.
        """
        items = self.tolist()
        if not isinstance(items, list):
            # A zero-dimensional tensor converts to a bare Python scalar.
            items = [items]

        length = len(items)
        if truncate_at is not None and length > truncate_at:
            midpoint = truncate_at // 2
            # Slice the tail from an explicit start index: ``items[-0:]`` would
            # return the whole list when ``midpoint`` is 0.
            items = items[:midpoint] + [".."] + items[length - midpoint:]

        return "".join(int_to_vocab.get(x, str(x)) for x in items)

    def as_biopython(self):
        """
        Convert the tensor to a Biopython ``Seq`` holding the complete sequence.

        Biopython is imported inside the method, so it is only needed when this
        method is called.
        """
        from Bio.Seq import Seq

        # Never abbreviate here: the ".." placeholder is display-only and must
        # not end up inside sequence data.
        return Seq(self.as_chars(truncate_at=None))
def dna_seq_to_numpy(seq) -> np.ndarray:
    """
    Encode a sequence (e.g. from biopython) as a numpy array of vocabulary codes.

    The sequence is converted with ``str()`` and upper-cased. Each character
    listed in ``vocab_to_int`` is replaced by its integer code; all remaining
    characters are removed, so the result may be shorter than the input.

    Should this be a transform??

    Returns:
        A one-dimensional ``uint8`` array of vocabulary codes.
    """
    # View the characters as their byte values so they can be rewritten in place.
    codes = np.array(str(seq).upper(), "c").view(np.uint8)

    # Drop everything below ASCII 'A' (65). The boolean index also yields a
    # fresh array, so the assignments below do not touch the original buffer.
    codes = codes[codes >= ord("A")]

    for letter, code in vocab_to_int.items():
        codes[codes == ord(letter)] = code

    # Letters that were not replaced are still >= 65 and are filtered out here.
    is_known = codes < len(vocab_to_int)
    return np.array(codes[is_known], dtype="u1")
def dna_seq_to_tensor(seq) -> TensorDNA:
    """
    Transforms a sequence from biopython to a TensorDNA tensor.

    The sequence is first encoded with ``dna_seq_to_numpy`` and the resulting
    array is wrapped in ``TensorDNA``.

    Should this be a pipeline?
    Can we use the ToTensor transform in fastai?
    """
    return TensorDNA(dna_seq_to_numpy(seq))