Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import pandas as pd 

2from . import refseq 

3import asyncio 

4 

5 

def preprocess(categories=None, base_dir=None, max_files=None, file_indexes=None):
    """Preprocess one or more RefSeq categories and return the combined result.

    Args:
        categories: A single category name, an iterable of names, or a falsy
            value to fall back to ``refseq.REFSEQ_CATEGORIES``.
        base_dir: Forwarded unchanged to ``refseq.RefSeqCategory``.
        max_files: Forwarded unchanged to ``refseq.RefSeqCategory``.
        file_indexes: Forwarded unchanged to ``RefSeqCategory.write_h5``.

    Returns:
        The ``pd.concat`` of every per-category result, re-indexed from zero
        (``ignore_index=True``).
    """
    selected = categories or refseq.REFSEQ_CATEGORIES

    # A bare string is one category name, not an iterable of characters.
    if isinstance(selected, str):
        selected = [selected]

    def _write_category(name):
        # Announce progress first (flushed so it appears immediately), then
        # build the category and collect whatever write_h5 returns --
        # presumably a DataFrame, since the results are fed to pd.concat.
        print(f"Preprocessing {name}", flush=True)
        category = refseq.RefSeqCategory(
            name=name,
            max_files=max_files,
            base_dir=base_dir,
        )
        return category.write_h5(file_indexes=file_indexes)

    frames = [_write_category(name) for name in selected]
    return pd.concat(frames, ignore_index=True)

23 

24 

def download(categories=None, base_dir=None, max_files=None):
    """Run ``download_all`` for one or more RefSeq categories.

    Args:
        categories: A single category name, an iterable of names, or a falsy
            value to fall back to ``refseq.REFSEQ_CATEGORIES``.
        base_dir: Forwarded unchanged to ``refseq.RefSeqCategory``.
        max_files: Forwarded unchanged to ``refseq.RefSeqCategory``.

    Returns:
        None. Categories are handled one after another, in the order given.
    """
    selected = categories or refseq.REFSEQ_CATEGORIES

    # A bare string is one category name, not an iterable of characters.
    names = [selected] if isinstance(selected, str) else selected

    for category_name in names:
        category = refseq.RefSeqCategory(
            name=category_name,
            base_dir=base_dir,
            max_files=max_files,
        )
        print(f"Downloading raw files for {category}")
        category.download_all()
        # Alternative kept for reference: asyncio.run(category.download_all())