Coverage for vorlagellm/rag.py: 67.53%
77 statements
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
1from pathlib import Path
2from langchain.schema import Document as EmbeddingDocument
3from langchain_chroma import Chroma
4from langchain_openai import OpenAIEmbeddings
5from vorlagellm.tei import (
6 get_verses,
7 get_reading_permutations,
8 get_verse_text,
9)
10from rich.progress import track, Progress
def sentence_components(sentence: str, word_count: int = 2) -> list[str]:
    """Return every run of ``word_count`` consecutive words in ``sentence``.

    The sentence is split on whitespace and a window of ``word_count`` words
    is slid across it one word at a time.

    Args:
        sentence: Text to break into overlapping word windows.
        word_count: Number of words in each window.

    Returns:
        The windows in order of appearance, each joined with single spaces.
        The list is empty when ``sentence`` has fewer than ``word_count``
        words or when ``word_count`` is less than 1.
    """
    if word_count < 1:
        # A zero or negative width would otherwise slice out empty strings or
        # mis-sized fragments instead of real phrases.
        return []

    words = sentence.split()
    last_start = len(words) - word_count
    return [
        " ".join(words[start:start + word_count])
        for start in range(last_start + 1)
    ]
def get_similar_verses(db, verse: str, window: int = 0) -> dict[str, EmbeddingDocument]:
    """Find documents in ``db`` that resemble the stored entries for ``verse``.

    Every embedding stored under ``verse`` is used for a vector similarity
    search.  When ``window`` is non-zero, each stored text for the verse is
    also cut into ``window``-word phrases (see ``sentence_components``) and
    each phrase is used for a text similarity search.

    Args:
        db: Vector store providing ``get``, ``similarity_search_by_vector``
            and ``similarity_search``.
        verse: Value of the ``verse`` metadata field to look up.
        window: Words per phrase for the additional phrase search; ``0``
            skips that search.

    Returns:
        A mapping from verse identifier to the first matching document seen
        for that verse.  ``verse`` itself is never included.  The mapping is
        empty (and a notice is printed) when ``db`` holds no embeddings for
        ``verse``.
    """
    verse_results = db.get(where={"verse": verse}, include=['embeddings', 'documents'])
    embeddings = verse_results['embeddings']

    # Test the length instead of truthiness: the store may hand the embeddings
    # back as a numpy array (recent chromadb versions do), and the truth value
    # of a multi-row array raises ValueError.
    if embeddings is None or len(embeddings) == 0:
        print(f"Verse {verse} not found.")
        return {}

    similar_docs: dict[str, EmbeddingDocument] = {}

    def remember(matches) -> None:
        # Keep only the first hit per verse and never the queried verse.
        for similar in matches:
            similar_verse = similar.metadata['verse']
            if similar_verse != verse and similar_verse not in similar_docs:
                similar_docs[similar_verse] = similar

    for embedding_vector in embeddings:
        remember(db.similarity_search_by_vector(embedding_vector))

    if window:
        for verse_text in verse_results['documents']:
            for components in sentence_components(verse_text, window):
                remember(db.similarity_search(components))

    return similar_docs
def get_similar_verses_by_phrase(db, phrase: str) -> set[str]:
    """Collect the verses of the documents ``db`` ranks as similar to ``phrase``.

    Args:
        db: Vector store providing ``similarity_search``.
        phrase: Query text passed to the similarity search.

    Returns:
        The distinct ``verse`` metadata values of the returned documents.
    """
    return {match.metadata['verse'] for match in db.similarity_search(phrase)}
def build_apparatus_embeddingdocs(apparatus, ignore_types: list[str] | None = None) -> list[EmbeddingDocument]:
    """Build one embedding document per reading permutation of each verse.

    Verses come from ``get_verses(apparatus)`` and are iterated under a
    ``rich`` progress tracker.  For each verse at most 10 permutations are
    requested from ``get_reading_permutations``.

    Args:
        apparatus: Apparatus object handed to the ``vorlagellm.tei`` helpers.
        ignore_types: Forwarded unchanged to ``get_reading_permutations``.

    Returns:
        Documents whose content is the permutation text and whose metadata
        holds ``index`` (position of the permutation within its verse) and
        ``verse``.
    """
    docs = []
    for verse in track(get_verses(apparatus)):
        readings = get_reading_permutations(
            apparatus,
            verse,
            ignore_types=ignore_types,
            max_permutations=10,
        )
        docs.extend(
            EmbeddingDocument(
                page_content=reading.text,
                metadata={"index": position, "verse": verse},
            )
            for position, reading in enumerate(readings)
        )
    return docs
def build_teidoc_embeddingdocs(teidoc) -> list[EmbeddingDocument]:
    """Build one embedding document for each verse of ``teidoc``.

    Args:
        teidoc: TEI document handed to ``get_verses`` and ``get_verse_text``.

    Returns:
        Documents whose content is the verse text and whose metadata holds
        the ``verse`` identifier, in the order ``get_verses`` yields them.
    """
    return [
        EmbeddingDocument(
            page_content=get_verse_text(teidoc, verse),
            metadata={"verse": verse},
        )
        for verse in get_verses(teidoc)
    ]
def get_db(docs: list[EmbeddingDocument], model: OpenAIEmbeddings, path: Path | str) -> Chroma:
    """Open the Chroma store at ``path``, or build it from ``docs``.

    When ``path`` already exists the store is opened as-is and ``docs`` is
    ignored.  Otherwise the documents are embedded with ``model`` and added
    in batches of 100 while a progress bar is shown.

    Args:
        docs: Documents to index when the store has to be built.  May be
            ``None`` when the store already exists.
        model: Embedding function used by the store.
        path: Directory in which the store is persisted.  A falsy value
            (``None`` or ``""``) is treated as "do not persist"; the callers
            in this module guard for that case before checking the path.

    Returns:
        The Chroma store.  It is always a store object, even when ``docs``
        is empty.
    """
    # str(None) would name a directory called "None" and Path(None) raises,
    # so a falsy path is mapped to no persist directory at all.
    # NOTE(review): assumes Chroma keeps the collection unpersisted when
    # persist_directory is None — confirm against the langchain_chroma docs.
    persist_directory = str(path) if path else None

    if persist_directory and Path(path).exists():
        return Chroma(persist_directory=persist_directory, embedding_function=model)

    docs = docs or []
    print(f"Embedding {len(docs)} items to {path}")
    batch_size = 100

    # Create the store up front so that an empty ``docs`` still yields a
    # store rather than None, and every batch goes through the same call.
    db = Chroma(persist_directory=persist_directory, embedding_function=model)

    with Progress() as progress:
        task = progress.add_task("[cyan]Indexing documents...", total=len(docs))
        for start in range(0, len(docs), batch_size):
            batch = docs[start:start + batch_size]
            db.add_documents(batch)
            progress.update(task, advance=len(batch))

    return db
def get_apparatus_db(apparatus, model: OpenAIEmbeddings, path: Path | str, ignore_types: list[str] | None = None) -> Chroma:
    """Return the vector store for ``apparatus``, building it if needed.

    When ``path`` is set and already exists, ``get_db`` is asked to open it
    and no documents are built.  Otherwise the apparatus is converted with
    ``build_apparatus_embeddingdocs`` and passed to ``get_db``.

    Args:
        apparatus: Apparatus to index.
        model: Embedding function passed on to ``get_db``.
        path: Location of the store.
        ignore_types: Forwarded to ``build_apparatus_embeddingdocs``.

    Returns:
        Whatever ``get_db`` returns for the resolved documents and path.
    """
    already_indexed = bool(path) and Path(path).exists()
    # Building the documents is skipped entirely when the store is on disk.
    items = None if already_indexed else build_apparatus_embeddingdocs(apparatus, ignore_types=ignore_types)
    return get_db(items, model, path)
def get_teidoc_db(teidoc, model: OpenAIEmbeddings, path: Path | str) -> Chroma:
    """Return the vector store for ``teidoc``, building it if needed.

    When ``path`` is set and already exists, ``get_db`` is asked to open it
    and no documents are built.  Otherwise the document is converted with
    ``build_teidoc_embeddingdocs`` and passed to ``get_db``.

    Args:
        teidoc: TEI document to index.
        model: Embedding function passed on to ``get_db``.
        path: Location of the store.

    Returns:
        Whatever ``get_db`` returns for the resolved documents and path.
    """
    already_indexed = bool(path) and Path(path).exists()
    # Building the documents is skipped entirely when the store is on disk.
    items = None if already_indexed else build_teidoc_embeddingdocs(teidoc)
    return get_db(items, model, path)