Coverage for vorlagellm/main.py: 52.98%
168 statements
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
1import typer
2from typing_extensions import Annotated
3from pathlib import Path
4from rich.progress import track
5from rich.console import Console
6from langchain_openai import OpenAIEmbeddings
7import llmloader
9from .chains import build_corresponding_text_chain, build_source_chain
10from .prompts import readings_list_to_str
11from .rag import get_apparatus_db, get_teidoc_db, get_db, get_similar_verses, get_similar_verses_by_phrase
12from .agreements import count_witness_agreements, WitnessComparison
13from vorlagellm.tei import (
14 read_tei,
15 get_siglum,
16 add_siglum,
17 get_language,
18 get_verses,
19 get_reading_permutations,
20 find_readings,
21 get_verse_text,
22 add_doc_metadata,
23 add_witness_readings,
24 write_tei,
25 add_wit_detail,
26 find_elements,
27 get_language_code,
28 get_verse_element,
29 add_responsibility_statement_llm,
30 extract_text,
31 reading_has_witness,
32 get_apparatus_verse_text,
33 write_elements,
34 find_parent,
35 app_has_witness,
36)
37from .ensemble import do_ensemble
# Shared Rich console used by the commands below for styled terminal output.
console = Console()

# Typer application object; each CLI command in this module registers itself
# on it with ``@app.command()``.
app = typer.Typer()

# Default value of the ``model`` option of the ``run`` command.
DEFAULT_MODEL_ID = "gpt-4.1"
# Model name passed to ``OpenAIEmbeddings`` wherever this module builds or
# opens a vector database.
DEFAULT_EMBEDDING_MODEL_ID = "text-embedding-3-large"
@app.command()
def run(
    doc: Path,  # TEI file of the document to analyse (loaded with read_tei)
    apparatus: Path,  # TEI file of the apparatus holding the candidate readings
    output: Path,  # where the updated apparatus is written with write_tei
    api_key: str = "",  # forwarded to llmloader.load
    model: str = DEFAULT_MODEL_ID,  # LLM id forwarded to llmloader.load
    apparatus_db: Path = None,  # optional database path handed to get_apparatus_db
    doc_db: Path = None,  # optional database path handed to get_teidoc_db
    siglum: str = "",  # overrides the siglum read from the document
    notes: Path = None,  # optional text file whose content is given to the source chain
    include: list[str] = None,  # if given, only these verses are processed
    ignore: list[str] = None,  # forwarded as ``ignore_types`` to the TEI/RAG helpers
    initiate_response: bool = False,  # forwarded to both chain builders
):
    """ Runs the main VorlageLLM pipeline on a document to predict which source readings from an apparatus could have produced its text. """
    # The docstring above is kept verbatim because Typer uses it as the
    # command's --help text.
    #
    # Returns the apparatus object (as produced by read_tei) after the
    # document's siglum has been added to the selected readings; the same
    # object is also written to ``output``.
    #
    # Raises AssertionError when the siglum or either language cannot be
    # determined. The checks are raised explicitly instead of using ``assert``
    # so they are not stripped under ``python -O``; the exception type is
    # unchanged for existing callers.
    llm = llmloader.load(model=model, api_key=api_key)
    doc_path = doc
    doc = read_tei(doc_path)
    apparatus_path = apparatus
    apparatus = read_tei(apparatus_path)

    # Add as witness to apparatus
    siglum = siglum or get_siglum(doc)
    if not siglum:
        raise AssertionError(
            f"Could not determine siglum in '{doc_path}'. Please add a siglum to the TEI XML or add a siglum in the command line with --siglum"
        )
    witness_element = add_siglum(apparatus, siglum)

    # A missing or non-existent notes file is treated as "no notes".
    if notes and Path(notes).exists():
        notes = Path(notes).read_text()
    else:
        notes = ""

    # Add responsibility statement
    _, resp_id = add_responsibility_statement_llm(apparatus, siglum, model)

    # Add metadata to apparatus
    add_doc_metadata(witness_element, doc)

    # Get languages
    doc_language = get_language(doc)
    doc_language_code = get_language_code(doc)
    if not doc_language:
        raise AssertionError(f"Could not determine language of document {doc_path}")

    apparatus_language = get_language(apparatus)
    if not apparatus_language:
        raise AssertionError(f"Could not determine language of apparatus {apparatus_path}")

    # Open the optional databases for the document and the apparatus. One
    # embeddings client is built and shared instead of one per database.
    if doc_db or apparatus_db:
        embeddings_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    if doc_db:
        doc_db = get_teidoc_db(doc, model=embeddings_model, path=doc_db)
    if apparatus_db:
        apparatus_db = get_apparatus_db(apparatus, model=embeddings_model, path=apparatus_db, ignore_types=ignore)

    # Create chain to use
    corresponding_text_chain = build_corresponding_text_chain(
        llm,
        doc_language=doc_language,
        apparatus_language=apparatus_language,
        initiate_response=initiate_response,
    )
    source_chain = build_source_chain(
        llm,
        doc_language=doc_language,
        apparatus_language=apparatus_language,
        notes=notes,
        initiate_response=initiate_response,
    )

    verses = get_verses(apparatus)
    if include:
        verses = [v for v in verses if v in include]

    for verse in verses:
        doc_verse_text = get_verse_text(doc, verse)
        console.rule(f"Verse '{verse}'", style="bold red")
        console.print(f"Text: {doc_verse_text}")
        apparatus_verse_element = get_verse_element(apparatus, verse)

        # Named ``app_element`` so the loop variable does not shadow the
        # module-level Typer ``app``.
        for app_element in find_elements(apparatus_verse_element, ".//app"):
            # Skip variation units where this witness is already recorded.
            if app_has_witness(app_element, siglum):
                continue

            # With fewer than two readings there is nothing to choose between.
            readings = find_readings(app_element, ignore_types=ignore)
            if len(readings) < 2:
                continue

            apparatus_verse_text = get_apparatus_verse_text(app_element)

            console.print(f"Apparatus text: [blue]{apparatus_verse_text}[/blue]")

            # Extract each reading's text once and reuse it below.
            reading_texts = [extract_text(reading) for reading in readings]
            reading_list = ", ".join(
                [("⸂" + reading + "⸃") if reading else "⸂OMISSION⸃" for reading in reading_texts]
            )
            readings_string = readings_list_to_str(reading_texts)
            permutations = "\n".join(
                [
                    permutation.text
                    for permutation in get_reading_permutations(
                        apparatus,
                        verse,
                        witness=siglum,
                        bracket_app=app_element,
                        max_permutations=10,
                        ignore_types=ignore,
                    )
                ]
            )
            doc_corresponding_text = corresponding_text_chain.invoke(dict(
                doc_verse_text=doc_verse_text,
                permutations=permutations,
                reading_list=reading_list
            ))

            console.print(f"Corresponding text: [blue]{doc_corresponding_text}[/blue]")

            # find similar verses
            similar_verses = set()
            if doc_db:
                doc_verse_text = doc_verse_text or ""
                similar_verses.update(get_similar_verses_by_phrase(doc_db, doc_verse_text))
                if doc_corresponding_text:
                    similar_verses.update(get_similar_verses_by_phrase(doc_db, doc_corresponding_text))
            if apparatus_db:
                for reading_text in reading_texts:
                    similar_verses.update(get_similar_verses_by_phrase(apparatus_db, reading_text))
            # The verse under analysis must not be offered as its own example.
            # NOTE(review): nesting reconstructed from flattened text; assumed
            # this runs regardless of which database supplied matches. Confirm.
            similar_verses.discard(verse)

            # NOTE(review): the prompt text below contains typos ("might not
            # clear", "gramatical"). It is left byte-identical so this edit
            # does not change what the model is sent.
            similar_verse_examples = ""
            if similar_verses:
                similar_verse_examples = (
                    f"Here are {len(similar_verses)} similar texts to the one that you need to analyze. "
                    f"You will see the {doc_language} language text and then all potential {apparatus_language} source texts. "
                    f"Even though might not clear which {apparatus_language} was the actual source, consider the translation technique going from {apparatus_language} to {doc_language}.\n"
                    "See the way that the translator has translated particular words and gramatical constructions that are similar to the texts you need to analyze. \n\n"
                )
                for similar_verse in similar_verses:
                    example_doc_text = get_verse_text(doc, similar_verse)
                    similar_verse_permutations = get_reading_permutations(
                        apparatus,
                        similar_verse,
                        witness=siglum,
                        max_permutations=5,
                        ignore_types=ignore,
                    )
                    similar_readings = readings_list_to_str(
                        [similar_verse_permutation.text for similar_verse_permutation in similar_verse_permutations]
                    )
                    similar_verse_examples += (
                        f"{doc_language} example {similar_verse}:\n{example_doc_text}\n"
                        f"Possible {apparatus_language} source(s):\n{similar_readings}\n\n"
                    )
                # Restate the actual task after the examples.
                # NOTE(review): assumed to belong to the ``if similar_verses:``
                # branch; the flattened source does not show the nesting. Confirm.
                similar_verse_examples += (
                    f"Here is the {doc_language} text to analyze:\n{doc_corresponding_text}\n[Full text in context: {doc_verse_text}]\n\n"
                    f"Here is the source {apparatus_language} text to analyze with the textual variant in brackets like this: ⸂ ⸃:\n{apparatus_verse_text}\n\n"
                    f"Here are the potential {apparatus_language} readings that go between the brackets that could be the source of '{doc_corresponding_text}':\n{readings_string}"
                )

            results, justification = source_chain.invoke(dict(
                doc_verse_text=doc_verse_text,
                doc_corresponding_text=doc_corresponding_text,
                apparatus_verse_text=apparatus_verse_text,
                readings=readings_string,
                similar_verse_examples=similar_verse_examples,
            ))

            # ``results`` is tested for membership of reading indexes; selected
            # readings get the document's siglum added as a witness.
            for index, (reading, reading_text) in enumerate(zip(readings, reading_texts)):
                if index in results:
                    console.print(f"[bold green]✓ {reading_text}")
                    add_witness_readings(reading, siglum)
                else:
                    console.print(f"[grey62]𐄂 {reading_text}")

            add_wit_detail(
                app_element,
                siglum,
                phrase=doc_corresponding_text,
                phrase_lang=doc_language_code,
                note=justification,
                resp_id=resp_id,
            )

            console.print(justification, style="blue")

    # Write TEI XML output
    print("Writing TEI XML output to", output)
    write_tei(apparatus, output)

    return apparatus
@app.command()
def doc_db(
    doc: Path,  # TEI document file, loaded with read_tei
    db: Path,  # location handed to get_teidoc_db as ``path``
):
    """Creates a database for the document."""
    # The embeddings client is created before the document is read, so a
    # failure there surfaces first.
    embedder = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    tei_document = read_tei(doc)
    # Return whatever get_teidoc_db builds so the function is also usable
    # when called directly from Python.
    return get_teidoc_db(tei_document, model=embedder, path=db)
@app.command()
def apparatus_db(
    apparatus: Path,  # TEI apparatus file, loaded with read_tei
    db: Path,  # location handed to get_apparatus_db as ``path``
):
    """Creates a database for the apparatus."""
    # The embeddings client is created before the apparatus is read, so a
    # failure there surfaces first.
    embedder = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    tei_apparatus = read_tei(apparatus)
    # Return whatever get_apparatus_db builds so the function is also usable
    # when called directly from Python.
    return get_apparatus_db(tei_apparatus, model=embedder, path=db)
@app.command()
def similar(
    db: Path,  # location of an existing database, opened with get_db
    verse: str,  # verse to look up
    window: int = 3,  # forwarded unchanged to get_similar_verses
):
    """Prints the verses in a database that are similar to the given verse."""
    embeddings_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    # ``None`` is passed as get_db's first argument: no document is supplied
    # here, only the path of the database to open.
    vector_db = get_db(None, embeddings_model, db)
    similar_verses = get_similar_verses(vector_db, verse, window=window)

    console.print(f"Similar verses to [bold red]{verse}[/bold red]:")
    # The result maps each similar verse to an object exposing ``page_content``.
    for similar_verse, document in similar_verses.items():
        console.print(f"[green]{similar_verse}[/green]: {document.page_content}")
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


@app.command()
def evaluate(
    apparatus: Path,  # TEI apparatus file containing both sigla
    gold_siglum: str,  # witness whose readings are treated as correct
    prediction_siglum: str,  # witness whose readings are being scored
    false_positives: Path = None,  # optional output file for false positives
    false_negatives: Path = None,  # optional output file for false negatives
):
    """Scores the readings of a predicted siglum against a gold siglum in an apparatus."""
    # Every ``rdg`` element is classified by whether it lists the gold siglum
    # and/or the prediction siglum. The confusion-matrix counts and derived
    # rates are then printed. A rate whose denominator is zero (for example
    # precision when nothing was predicted) is reported as 0.0 instead of
    # raising ZeroDivisionError.
    apparatus = read_tei(apparatus)
    readings = find_elements(apparatus, ".//rdg")

    # Classify each reading once and keep the mismatches for the reports.
    tp = fp = fn = tn = 0
    fp_readings = []
    fn_readings = []
    for reading in readings:
        in_gold = reading_has_witness(reading, gold_siglum)
        in_prediction = reading_has_witness(reading, prediction_siglum)
        if in_gold and in_prediction:
            tp += 1
        elif in_prediction:
            fp += 1
            fp_readings.append(reading)
        elif in_gold:
            fn += 1
            fn_readings.append(reading)
        else:
            tn += 1

    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    fpr = _safe_ratio(fp, fp + tn)
    fnr = _safe_ratio(fn, fn + tp)

    console.print(f"Recall: {recall:.1%}")
    console.print(f"Precision: {precision:.1%}")
    console.print(f"False Negative Rate: {fnr:.1%}")
    console.print(f"False Positive Rate: {fpr:.1%}")
    console.print(f"False Positives: {fp}")
    console.print(f"False Negatives: {fn}")
    console.print(f"True Positives: {tp}")
    console.print(f"True Negatives: {tn}")

    console.print(f"F1: {f1:.1%}")

    # For each requested report, write the distinct ``ab`` ancestors of the
    # mismatched readings; a set is used so each ancestor appears only once.
    reports = (
        (false_positives, fp_readings, "false positives", "false-positives"),
        (false_negatives, fn_readings, "false negatives", "false-negatives"),
    )
    for report_path, mismatched, label, list_type in reports:
        if not report_path:
            continue
        parent_abs = {find_parent(reading, "ab") for reading in mismatched}
        console.print(f"Writing {len(mismatched)} {label} to {report_path}")
        write_elements(parent_abs, report_path, "listApp", type=list_type)
@app.command()
def agreements(
    apparatus: Path,  # TEI apparatus file, loaded with read_tei
    siglum1: str,  # first witness to compare
    siglum2: str,  # second witness to compare
    horizontal: bool = False,  # print one header row and one data row
):
    """Prints the count of each WitnessComparison category for two witnesses as tab-separated values."""
    apparatus = read_tei(apparatus)
    counter = count_witness_agreements(apparatus, siglum1, siglum2)

    categories = list(WitnessComparison)
    if horizontal:
        # Wide layout: a header line followed by a single line of values.
        print("siglum1", "siglum2", *[category.plural for category in categories], sep="\t")
        print(siglum1, siglum2, *[counter[category] for category in categories], sep="\t")
    else:
        # Long layout: one "name<TAB>value" pair per line.
        print("siglum1", siglum1, sep="\t")
        print("siglum2", siglum2, sep="\t")
        for category in categories:
            print(category.plural, counter[category], sep="\t")
@app.command()
def ensemble(siglum: str, output: Path, apparatuses: list[Path]):
    """Builds an ensemble for a siglum from several apparatus files and writes it as TEI XML."""
    # siglum: witness passed to do_ensemble.
    # output: file the result is written to with write_tei.
    # apparatuses: TEI apparatus files, each loaded with read_tei.
    loaded_apparatuses = [read_tei(apparatus_path) for apparatus_path in apparatuses]
    result = do_ensemble(loaded_apparatuses, siglum)
    print(f"Writing ensemble to {output}")
    write_tei(result, output)