Coverage for vorlagellm/rag.py: 67.53%

77 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-10-24 03:22 +0000

1from pathlib import Path 

2from langchain.schema import Document as EmbeddingDocument 

3from langchain_chroma import Chroma 

4from langchain_openai import OpenAIEmbeddings 

5from vorlagellm.tei import ( 

6 get_verses, 

7 get_reading_permutations, 

8 get_verse_text, 

9) 

10from rich.progress import track, Progress 

11 

12 

def sentence_components(sentence:str, word_count:int=2) -> list[str]:
    """Split a sentence into overlapping windows of consecutive words.

    Args:
        sentence: Text to split; words are separated on any whitespace.
        word_count: Number of words in each window.

    Returns:
        Every run of ``word_count`` consecutive words, joined by single
        spaces, in order of appearance. The list is empty when the sentence
        has fewer than ``word_count`` words or when ``word_count`` is less
        than 1.
    """
    # A non-positive size used to fall through to the slicing below and
    # yield empty or truncated strings rather than real word windows.
    if word_count < 1:
        return []

    words = sentence.split()
    window_total = len(words) - word_count + 1
    return [" ".join(words[start:start + word_count]) for start in range(window_total)]

16 

17 

def get_similar_verses(db, verse:str, window:int=0) -> dict[str,EmbeddingDocument]|None:
    """Find documents from other verses that are similar to ``verse``.

    Every embedding stored for ``verse`` is used as a query vector. When
    ``window`` is non-zero, each stored text of the verse is additionally
    split into ``window``-word phrases and every phrase is searched too.

    Args:
        db: Vector store providing ``get``, ``similarity_search_by_vector``
            and ``similarity_search``.
        verse: Value of the ``verse`` metadata field to look up.
        window: Phrase length in words for the extra phrase searches;
            ``0`` skips them.

    Returns:
        A dict mapping each other verse to the first similar document found
        for it, or ``None`` (after printing a message) when the store holds
        no embeddings for ``verse``.
    """
    verse_results = db.get(where={"verse": verse}, include=['embeddings', 'documents'])
    embeddings = verse_results['embeddings']
    # Check emptiness by length rather than truthiness: depending on the
    # Chroma version the embeddings may come back as a NumPy array, and
    # ``not array`` raises ValueError once it holds more than one element.
    if embeddings is None or len(embeddings) == 0:
        print(f"Verse {verse} not found.")
        return None

    similar_docs = dict()

    def remember(matches):
        # Keep only the first hit per verse and never the query verse itself.
        for match in matches:
            match_verse = match.metadata['verse']
            if match_verse != verse and match_verse not in similar_docs:
                similar_docs[match_verse] = match

    for embedding_vector in embeddings:
        remember(db.similarity_search_by_vector(embedding_vector))

    if window:
        for verse_text in verse_results['documents']:
            for phrase in sentence_components(verse_text, window):
                remember(db.similarity_search(phrase))

    return similar_docs

42 

43 

def get_similar_verses_by_phrase(db, phrase:str) -> set[str]:
    """Return the verses of the documents most similar to ``phrase``.

    Args:
        db: Vector store providing ``similarity_search``.
        phrase: Query text passed to the similarity search.

    Returns:
        The distinct ``verse`` metadata values of the documents returned by
        the search.
    """
    matches = db.similarity_search(phrase)
    return {match.metadata['verse'] for match in matches}

50 

51 

def build_apparatus_embeddingdocs(apparatus, ignore_types:list[str]|None=None, max_permutations:int=10) -> list[EmbeddingDocument]:
    """Build one embedding document per reading permutation of every verse.

    Args:
        apparatus: Apparatus handed to ``get_verses`` and
            ``get_reading_permutations``.
        ignore_types: Forwarded unchanged to ``get_reading_permutations``.
        max_permutations: Forwarded to ``get_reading_permutations``;
            defaults to 10, the value that was previously hard-coded.

    Returns:
        Documents whose content is the permutation text and whose metadata
        holds the ``verse`` and the permutation's ``index`` within it.
    """
    documents = []
    # ``track`` wraps the verse iteration in a rich progress bar.
    for verse in track(get_verses(apparatus)):
        permutations = get_reading_permutations(
            apparatus,
            verse,
            ignore_types=ignore_types,
            max_permutations=max_permutations,
        )
        documents.extend(
            EmbeddingDocument(
                page_content=permutation.text,
                metadata=dict(index=index, verse=verse),
            )
            for index, permutation in enumerate(permutations)
        )
    return documents

65 

66 

def build_teidoc_embeddingdocs(teidoc) -> list[EmbeddingDocument]:
    """Build one embedding document per verse of a TEI document.

    Args:
        teidoc: Document handed to ``get_verses`` and ``get_verse_text``.

    Returns:
        Documents whose content is the verse text and whose metadata holds
        the ``verse``, in the order ``get_verses`` yields them.
    """
    return [
        EmbeddingDocument(
            page_content=get_verse_text(teidoc, verse),
            metadata={"verse": verse},
        )
        for verse in get_verses(teidoc)
    ]

77 

78 

def get_db(docs:list[EmbeddingDocument], model:OpenAIEmbeddings, path:Path|str|None) -> Chroma:
    """Open the Chroma store at ``path``, building it from ``docs`` if absent.

    Args:
        docs: Documents to embed when no store exists at ``path``. Not read
            when the store already exists, so it may be ``None`` then.
        model: Embedding model used both to build and to query the store.
        path: Directory of the persisted store. A falsy value (``None`` or
            an empty string) means there is no directory to load from or
            persist to.

    Returns:
        The loaded or newly built store.

    NOTE: when a new store has to be built and ``docs`` is empty, no store
    is created and ``None`` is returned.
    """
    # The wrappers in this module guard with ``if path and ...``, so accept a
    # falsy path here too: ``Path(None)`` would raise TypeError. Passing
    # ``persist_directory=None`` leaves the choice of storage to Chroma.
    persist_directory = str(path) if path else None

    if path and Path(path).exists():
        return Chroma(persist_directory=persist_directory, embedding_function=model)

    print(f"Embedding {len(docs)} items to {path}")
    batch_size = 100
    db = None

    with Progress() as progress:
        task = progress.add_task("[cyan]Indexing documents...", total=len(docs))

        # Embed in fixed-size batches so the progress bar advances steadily.
        for start in range(0, len(docs), batch_size):
            batch = docs[start:start + batch_size]
            if db is None:
                # The first batch creates the store; later ones extend it.
                db = Chroma.from_documents(
                    documents=batch,
                    embedding=model,
                    persist_directory=persist_directory,
                )
            else:
                db.add_documents(batch)
            progress.update(task, advance=len(batch))
    return db

104 

105 

def get_apparatus_db(apparatus, model:OpenAIEmbeddings, path:Path|str, ignore_types:list[str]|None=None) -> Chroma:
    """Return the vector store for an apparatus, embedding it only if needed.

    Args:
        apparatus: Apparatus to embed when no store exists at ``path``.
        model: Embedding model handed to ``get_db``.
        path: Directory of the persisted store.
        ignore_types: Forwarded to ``build_apparatus_embeddingdocs``.

    Returns:
        Whatever ``get_db`` returns for ``path``.
    """
    store_exists = bool(path) and Path(path).exists()
    # Skip building the documents when the store can simply be reloaded.
    items = None if store_exists else build_apparatus_embeddingdocs(apparatus, ignore_types=ignore_types)
    return get_db(items, model, path)

111 

112 

def get_teidoc_db(teidoc, model:OpenAIEmbeddings, path:Path|str) -> Chroma:
    """Return the vector store for a TEI document, embedding it only if needed.

    Args:
        teidoc: Document to embed when no store exists at ``path``.
        model: Embedding model handed to ``get_db``.
        path: Directory of the persisted store.

    Returns:
        Whatever ``get_db`` returns for ``path``.
    """
    store_exists = bool(path) and Path(path).exists()
    # Skip building the documents when the store can simply be reloaded.
    items = None if store_exists else build_teidoc_embeddingdocs(teidoc)
    return get_db(items, model, path)