Coverage for vorlagellm/main.py: 52.98%

168 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-10-24 03:22 +0000

1import typer 

2from typing_extensions import Annotated 

3from pathlib import Path 

4from rich.progress import track 

5from rich.console import Console 

6from langchain_openai import OpenAIEmbeddings 

7import llmloader 

8 

9from .chains import build_corresponding_text_chain, build_source_chain 

10from .prompts import readings_list_to_str 

11from .rag import get_apparatus_db, get_teidoc_db, get_db, get_similar_verses, get_similar_verses_by_phrase 

12from .agreements import count_witness_agreements, WitnessComparison 

13from vorlagellm.tei import ( 

14 read_tei, 

15 get_siglum, 

16 add_siglum, 

17 get_language, 

18 get_verses, 

19 get_reading_permutations, 

20 find_readings, 

21 get_verse_text, 

22 add_doc_metadata, 

23 add_witness_readings, 

24 write_tei, 

25 add_wit_detail, 

26 find_elements, 

27 get_language_code, 

28 get_verse_element, 

29 add_responsibility_statement_llm, 

30 extract_text, 

31 reading_has_witness, 

32 get_apparatus_verse_text, 

33 write_elements, 

34 find_parent, 

35 app_has_witness, 

36) 

37from .ensemble import do_ensemble 

38 

# Shared Rich console used by the commands in this module for styled terminal output.
console = Console()

# Typer application; each function decorated with @app.command() below is registered on it.
app = typer.Typer()

# Default model identifier handed to llmloader.load() by `run` when no model is given.
DEFAULT_MODEL_ID = "gpt-4.1"
# Model identifier used whenever this module constructs OpenAIEmbeddings.
DEFAULT_EMBEDDING_MODEL_ID = "text-embedding-3-large"

45 

46 

@app.command()
def run(
    doc: Path,  # TEI file of the document to analyze
    apparatus: Path,  # TEI file of the apparatus whose readings are candidate sources
    output: Path,  # where the updated apparatus TEI is written
    api_key: str = "",  # forwarded to llmloader.load()
    model: str = DEFAULT_MODEL_ID,  # forwarded to llmloader.load() and recorded in the responsibility statement
    apparatus_db: Path = None,  # optional path of an embeddings database for the apparatus
    doc_db: Path = None,  # optional path of an embeddings database for the document
    siglum: str = "",  # witness siglum; falls back to the siglum found in the document
    notes: Path = None,  # optional text file whose contents are passed to the source chain
    include: list[str] = None,  # if given, only these verses are processed
    ignore: list[str] = None,  # reading types passed as `ignore_types` to the TEI helpers
    initiate_response: bool = False,  # forwarded to both chain builders
):
    """Runs the main VorlageLLM pipeline on a document to predict which source readings from an apparatus could have produced its text."""
    # Parameter documentation lives in the signature comments above so that the
    # docstring (which Typer shows as the command's help text) stays unchanged.
    #
    # Returns the apparatus object after it has also been written to `output`.
    # Raises AssertionError if no siglum or no document/apparatus language can be determined.
    llm = llmloader.load(model=model, api_key=api_key)
    doc_path = doc
    doc = read_tei(doc_path)
    apparatus_path = apparatus
    apparatus = read_tei(apparatus_path)

    # Add as witness to apparatus
    siglum = siglum or get_siglum(doc)
    assert siglum, f"Could not determine siglum in '{doc_path}'. Please add a siglum to the TEI XML or add a siglum in the command line with --siglum"
    witness_element = add_siglum(apparatus, siglum)

    # Load the optional notes. A path that does not exist is still tolerated,
    # but it is now reported instead of being dropped silently.
    notes_path = Path(notes) if notes else None
    if notes_path is not None and notes_path.exists():
        # Explicit encoding so the result does not depend on the platform locale.
        notes = notes_path.read_text(encoding="utf-8")
    else:
        if notes_path is not None:
            console.print(
                f"Warning: notes file '{notes_path}' does not exist; continuing without notes.",
                style="yellow",
                markup=False,  # the path is user input and must not be parsed as Rich markup
            )
        notes = ""

    # Add responsibility statement
    _, resp_id = add_responsibility_statement_llm(apparatus, siglum, model)

    # Add metadata to apparatus
    add_doc_metadata(witness_element, doc)

    # Get languages
    doc_language = get_language(doc)
    doc_language_code = get_language_code(doc)
    assert doc_language, f"Could not determine language of document {doc_path}"

    apparatus_language = get_language(apparatus)
    assert apparatus_language, f"Could not determine language of apparatus {apparatus_path}"

    # Create the embeddings databases; one embeddings client serves both.
    if doc_db or apparatus_db:
        embeddings_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    if doc_db:
        doc_db = get_teidoc_db(doc, model=embeddings_model, path=doc_db)
    if apparatus_db:
        apparatus_db = get_apparatus_db(apparatus, model=embeddings_model, path=apparatus_db, ignore_types=ignore)

    # Create chains to use
    corresponding_text_chain = build_corresponding_text_chain(llm, doc_language=doc_language, apparatus_language=apparatus_language, initiate_response=initiate_response)
    source_chain = build_source_chain(llm, doc_language=doc_language, apparatus_language=apparatus_language, notes=notes, initiate_response=initiate_response)

    verses = get_verses(apparatus)
    if include:
        verses = [v for v in verses if v in include]

    for verse in verses:
        doc_verse_text = get_verse_text(doc, verse)
        console.rule(f"Verse '{verse}'", style="bold red")
        console.print(f"Text: {doc_verse_text}")
        apparatus_verse_element = get_verse_element(apparatus, verse)

        # Named `app_element` so the module-level Typer `app` is not shadowed.
        for app_element in find_elements(apparatus_verse_element, ".//app"):
            # Skip variation units that already carry this witness.
            if app_has_witness(app_element, siglum):
                continue

            # With fewer than two readings there is nothing to choose between.
            readings = find_readings(app_element, ignore_types=ignore)
            if len(readings) < 2:
                continue

            apparatus_verse_text = get_apparatus_verse_text(app_element)

            console.print(f"Apparatus text: [blue]{apparatus_verse_text}[/blue]")

            # Extract each reading's text once and reuse it below.
            # NOTE(review): assumes extract_text() has no side effects — confirm.
            reading_texts = [extract_text(reading) for reading in readings]
            reading_list = ", ".join(("⸂" + text + "⸃") if text else "⸂OMISSION⸃" for text in reading_texts)
            readings_string = readings_list_to_str(reading_texts)
            permutations = "\n".join(
                permutation.text
                for permutation in get_reading_permutations(apparatus, verse, witness=siglum, bracket_app=app_element, max_permutations=10, ignore_types=ignore)
            )
            doc_corresponding_text = corresponding_text_chain.invoke(dict(
                doc_verse_text=doc_verse_text,
                permutations=permutations,
                reading_list=reading_list,
            ))

            console.print(f"Corresponding text: [blue]{doc_corresponding_text}[/blue]")

            # find similar verses
            similar_verses = set()
            if doc_db:
                doc_verse_text = doc_verse_text or ""
                similar_verses.update(get_similar_verses_by_phrase(doc_db, doc_verse_text))
                if doc_corresponding_text:
                    similar_verses.update(get_similar_verses_by_phrase(doc_db, doc_corresponding_text))
            if apparatus_db:
                for text in reading_texts:
                    similar_verses.update(get_similar_verses_by_phrase(apparatus_db, text))
            # The verse under analysis must not be offered as its own example.
            similar_verses.discard(verse)

            # NOTE(review): prompt wording below is kept verbatim (including its
            # typos) so that the text sent to the model is unchanged.
            similar_verse_examples = ""
            if similar_verses:
                example_parts = [
                    f"Here are {len(similar_verses)} similar texts to the one that you need to analyze. "
                    f"You will see the {doc_language} language text and then all potential {apparatus_language} source texts. "
                    f"Even though might not clear which {apparatus_language} was the actual source, consider the translation technique going from {apparatus_language} to {doc_language}.\n"
                    "See the way that the translator has translated particular words and gramatical constructions that are similar to the texts you need to analyze. \n\n"
                ]
                for similar_verse in similar_verses:
                    example_doc_text = get_verse_text(doc, similar_verse)
                    similar_verse_permutations = get_reading_permutations(apparatus, similar_verse, witness=siglum, max_permutations=5, ignore_types=ignore)
                    similar_readings = readings_list_to_str([similar_verse_permutation.text for similar_verse_permutation in similar_verse_permutations])
                    example_parts.append(
                        f"{doc_language} example {similar_verse}:\n{example_doc_text}\n"
                        f"Possible {apparatus_language} source(s):\n{similar_readings}\n\n"
                    )
                # Restate the actual task after the examples.
                example_parts.append(
                    f"Here is the {doc_language} text to analyze:\n{doc_corresponding_text}\n[Full text in context: {doc_verse_text}]\n\n"
                    f"Here is the source {apparatus_language} text to analyze with the textual variant in brackets like this: ⸂ ⸃:\n{apparatus_verse_text}\n\n"
                    f"Here are the potential {apparatus_language} readings that go between the brackets that could be the source of '{doc_corresponding_text}':\n{readings_string}"
                )
                similar_verse_examples = "".join(example_parts)

            results, justification = source_chain.invoke(dict(
                doc_verse_text=doc_verse_text,
                doc_corresponding_text=doc_corresponding_text,
                apparatus_verse_text=apparatus_verse_text,
                readings=readings_string,
                similar_verse_examples=similar_verse_examples,
            ))

            # `results` holds the indexes of the readings selected by the chain.
            for index, (reading, reading_text) in enumerate(zip(readings, reading_texts)):
                if index in results:
                    console.print(f"[bold green]✓ {reading_text}")
                    add_witness_readings(reading, siglum)
                else:
                    console.print(f"[grey62]𐄂 {reading_text}")

            add_wit_detail(app_element, siglum, phrase=doc_corresponding_text, phrase_lang=doc_language_code, note=justification, resp_id=resp_id)

            console.print(justification, style="blue")

    # Write TEI XML output
    print("Writing TEI XML output to", output)
    write_tei(apparatus, output)

    return apparatus

199 

200 

@app.command()
def doc_db(
    doc: Path,
    db: Path,
):
    """
    Creates a database for the document.
    """
    # The embeddings client is built before the TEI file is parsed, as before.
    embedder = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    tei_document = read_tei(doc)
    # Hand back whatever get_teidoc_db produces for the given database path.
    return get_teidoc_db(tei_document, model=embedder, path=db)

214 

215 

@app.command()
def apparatus_db(
    apparatus: Path,
    db: Path,
):
    """
    Creates a database for the apparatus.
    """
    # The embeddings client is built before the TEI file is parsed, as before.
    embedder = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    tei_apparatus = read_tei(apparatus)
    # Hand back whatever get_apparatus_db produces for the given database path.
    return get_apparatus_db(tei_apparatus, model=embedder, path=db)

228 

229 

@app.command()
def similar(
    db: Path,
    verse: str,
    window: int = 3,
):
    """Prints the verses that get_similar_verses returns for a verse in a database."""
    embedder = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
    # get_db receives None as its first argument here, exactly as before.
    store = get_db(None, embedder, db)
    matches = get_similar_verses(store, verse, window=window)

    console.print(f"Similar verses to [bold red]{verse}[/bold red]:")
    # One line per match: the verse key followed by the matched document's page content.
    for reference, hit in matches.items():
        console.print(f"[green]{reference}[/green]: {hit.page_content}")

243 

244 

def _safe_ratio(numerator: float, denominator: float) -> float:
    """Returns numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


@app.command()
def evaluate(
    apparatus: Path,
    gold_siglum: str,
    prediction_siglum: str,
    false_positives: Path = None,
    false_negatives: Path = None,
):
    """
    Scores the readings attested by a predicted siglum against those of a gold siglum.

    Every ``rdg`` element in the apparatus is classified as a true/false
    positive/negative according to whether it carries the gold and/or the
    prediction siglum. Recall, precision, false negative/positive rates, the
    raw counts and F1 are printed to the console. A metric whose denominator
    is zero is reported as 0 instead of raising ZeroDivisionError.

    If ``false_positives`` / ``false_negatives`` are given, the parent ``ab``
    elements of the respective readings are written to those paths.
    """
    apparatus = read_tei(apparatus)
    readings = find_elements(apparatus, ".//rdg")

    # Classify every reading once, collecting the error cases for optional export.
    tp = fp = fn = tn = 0
    fp_readings = []
    fn_readings = []
    for reading in readings:
        in_gold = bool(reading_has_witness(reading, gold_siglum))
        in_prediction = bool(reading_has_witness(reading, prediction_siglum))
        if in_gold and in_prediction:
            tp += 1
        elif in_prediction:
            fp += 1
            fp_readings.append(reading)
        elif in_gold:
            fn += 1
            fn_readings.append(reading)
        else:
            tn += 1

    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    fpr = _safe_ratio(fp, fp + tn)
    fnr = _safe_ratio(fn, fn + tp)

    console.print(f"Recall: {recall:.1%}")
    console.print(f"Precision: {precision:.1%}")
    console.print(f"False Negative Rate: {fnr:.1%}")
    console.print(f"False Positive Rate: {fpr:.1%}")
    console.print(f"False Positives: {fp}")
    console.print(f"False Negatives: {fn}")
    console.print(f"True Positives: {tp}")
    console.print(f"True Negatives: {tn}")

    console.print(f"F1: {f1:.1%}")

    if false_positives:
        # A set, so an `ab` holding several false positives is written only once.
        ab_elements = {find_parent(reading, "ab") for reading in fp_readings}
        console.print(f"Writing {len(fp_readings)} false positives to {false_positives}")
        write_elements(ab_elements, false_positives, "listApp", type="false-positives")

    if false_negatives:
        ab_elements = {find_parent(reading, "ab") for reading in fn_readings}
        console.print(f"Writing {len(fn_readings)} false negatives to {false_negatives}")
        write_elements(ab_elements, false_negatives, "listApp", type="false-negatives")

286 

287 

@app.command()
def agreements(
    apparatus: Path,
    siglum1: str,
    siglum2: str,
    horizontal: bool = False,
):
    """Prints, tab-separated, the count_witness_agreements tally for each WitnessComparison category."""
    tei_apparatus = read_tei(apparatus)
    tally = count_witness_agreements(tei_apparatus, siglum1, siglum2)

    if not horizontal:
        # Vertical layout: one "label<TAB>value" row per line.
        print("siglum1", siglum1, sep="\t")
        print("siglum2", siglum2, sep="\t")
        for category in WitnessComparison:
            print(category.plural, tally[category], sep="\t")
        return

    # Horizontal layout: a header row followed by a single row of values.
    header_cells = "\t".join(category.plural for category in WitnessComparison)
    print("siglum1", "siglum2", header_cells, sep="\t")
    value_cells = "\t".join(str(tally[category]) for category in WitnessComparison)
    print(siglum1, siglum2, value_cells, sep="\t")

312 

313 

@app.command()
def ensemble(siglum: str, output: Path, apparatuses: list[Path]):
    """Parses several apparatus files, combines them with do_ensemble and writes the result as TEI."""
    parsed_apparatuses = list(map(read_tei, apparatuses))
    combined = do_ensemble(parsed_apparatuses, siglum)
    print(f"Writing ensemble to {output}")
    write_tei(combined, output)