Coverage for vorlagellm/tei.py: 87.59%

282 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-10-24 03:22 +0000

1from pathlib import Path 

2from lxml import etree as ET 

3from lxml.etree import _ElementTree as ElementTree 

4from lxml.etree import _Element as Element 

5from lxml.etree import Element as new_element 

6from lxml.etree import ElementTree as new_element_tree 

7import re 

8from dataclasses import dataclass 

9from datetime import datetime 

10import copy 

11 

12from .languages import convert_language_code 

13 

14 

15@dataclass 

16class Permutation: 

17 text:str 

18 readings:list[Element] 

19 apps:list[Element]=None 

20 

21 

def read_tei(path:Path) -> ElementTree:
    """
    Parses the XML file at `path` and returns the resulting element tree.

    The file is opened in binary mode so that the XML parser decodes the bytes
    itself (from the XML declaration) instead of relying on the platform's
    default text encoding. The parser is configured with `remove_blank_text=True`.

    Args:
        path (Path): Location of the XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    with open(path, 'rb') as f:
        return ET.parse(f, parser)

26 

27 

28def find_element(doc:ElementTree|Element, xpath:str) -> Element|None: 

29 if isinstance(doc, ElementTree): 

30 doc = doc.getroot() 

31 element = doc.find(xpath, namespaces=doc.nsmap) 

32 if element is None: 

33 element = doc.find(xpath) 

34 return element 

35 

36 

37def find_elements(doc:ElementTree|Element, xpath:str) -> Element|None: 

38 if isinstance(doc, ElementTree): 

39 doc = doc.getroot() 

40 return doc.findall(xpath, namespaces=doc.nsmap) 

41 

42 

def get_siglum(doc:ElementTree|Element) -> str:
    """
    Reads the value of the 'n' attribute of the <title type="document"> element.

    An empty string is returned when the element, or its 'n' attribute, is missing.
    """
    title = find_element(doc, ".//title[@type='document']")
    return "" if title is None else title.attrib.get('n', "")

54 

55 

def get_language_code(doc:ElementTree|Element) -> str:
    """
    Returns the xml:lang attribute of the first <text> element.

    An empty string is returned when there is no <text> element or it has no xml:lang attribute.
    """
    text_element = find_element(doc, ".//text")
    if text_element is not None:
        return text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")
    return ""

63 

64 

def get_language(doc:ElementTree|Element) -> str:
    """ Returns the result of passing the document's language code (see get_language_code) to convert_language_code."""
    return convert_language_code(get_language_code(doc))

68 

69 

def app_has_witness(app:Element, siglum:str) -> bool:
    """ Tells whether at least one <rdg> element inside the apparatus entry lists the given siglum."""
    for reading in find_elements(app, ".//rdg"):
        if reading_has_witness(reading, siglum):
            return True
    return False

74 

75 

def get_verses(doc:ElementTree|Element) -> list[str]:
    """ Collects the "n" attribute of every <ab> element that has one, in the order the elements are found."""
    verses = []
    for ab in find_elements(doc, ".//ab"):
        if 'n' in ab.attrib:
            verses.append(ab.attrib['n'])
    return verses

80 

81 

def find_readings(element, ignore_types:list[str]|None) -> list[Element]:
    """
    Returns the <rdg> elements below `element`.

    When `ignore_types` is non-empty, readings whose 'type' attribute is one of
    those values are left out; a reading without a 'type' attribute is treated
    as having the type "".
    """
    readings = find_elements(element, ".//rdg")
    if not ignore_types:
        return readings

    kept = []
    for reading in readings:
        if reading.attrib.get("type", "") not in ignore_types:
            kept.append(reading)
    return kept

87 

88 

def get_reading_permutations(
    apparatus:ElementTree|Element,
    verse:str,
    witness:str="",
    bracket_app:Element|None=None,
    ignore_types:list[str]|None=None,
    max_permutations:int=0,
) -> list[Permutation]:
    """
    Builds the verse texts obtainable by choosing one reading at each <app> of a verse.

    The direct children of the verse's <ab> element are visited in order. Text of
    non-<app> children is appended to every permutation; each <app> multiplies the
    permutations by its usable readings. If an <app> contributes no usable reading,
    no permutation survives and the result is empty.

    Args:
        apparatus (ElementTree|Element): The document or element that contains the verse.
        verse (str): Value of the 'n' attribute of the <ab> element to process.
        witness (str): Optional siglum. In an <app> where this witness is listed,
            only the readings listing it are used.
        bracket_app (Element|None): Optional <app> element whose reading text is wrapped in ⸂ and ⸃.
        ignore_types (list[str]|None): Reading 'type' values to leave out.
        max_permutations (int): When positive, the result is thinned out at a regular
            stride so that at most this many permutations are returned. Zero means no limit.

    Returns:
        list[Permutation]: Permutations with whitespace-normalised text, the chosen
        readings and the list of visited <app> elements. Empty when the verse is not found.
    """
    verse_element = get_verse_element(apparatus, verse)
    if verse_element is None:
        return []

    permutations = [Permutation(text="", readings=[])]
    apps = []
    for child in verse_element:
        # Nodes such as comments have a tag that is not a string; they carry no verse text.
        if not isinstance(child.tag, str):
            continue

        tag = re.sub(r"\{.*\}", "", child.tag)
        if tag != "app":
            # The text of this child is the same for every permutation, so extract it once.
            child_text = extract_text(child)
            for permutation in permutations:
                permutation.text = (permutation.text + " " + child_text).strip()
            continue

        apps.append(child)
        has_witness = bool(witness) and app_has_witness(child, witness)
        is_bracketed = bracket_app is not None and bracket_app == child

        new_permutations = []
        for reading in find_readings(child, ignore_types=ignore_types):
            if has_witness and not reading_has_witness(reading, witness):
                continue

            reading_text = extract_text(reading) or ""
            if is_bracketed:
                reading_text = f"⸂{reading_text}⸃"

            for permutation in permutations:
                new_permutations.append(
                    Permutation(
                        text=permutation.text + " " + reading_text,
                        readings=permutation.readings + [reading],
                    )
                )
        permutations = new_permutations

    # Normalise whitespace and attach the visited apps; this cleaned list is what gets returned.
    cleaned = [
        Permutation(text=re.sub(r"\s+", " ", permutation.text.strip()), readings=permutation.readings, apps=apps)
        for permutation in permutations
    ]

    if 0 < max_permutations < len(cleaned):
        # A stride of len // max can still select more than max items, so cut the sample off at the limit.
        step = len(cleaned) // max_permutations
        return cleaned[::step][:max_permutations]

    return cleaned

155 

156 

157 

158# def extract_text(node:Element, include_tail:bool=True) -> str: 

159# text = node.text or "" 

160# for child in node: 

161# if isinstance(child.tag, str): 

162# tag = re.sub(r"{.*}", "", child.tag) 

163# else: 

164# continue 

165 

166# if tag in ["pc", "witDetail", "note"]: 

167# continue 

168# if tag == "app":  

169# breakpoint() 

170# lemma = find_element(child, ".//lem") 

171# if lemma is None: 

172# lemma = find_element(child, ".//rdg") 

173# text += extract_text(lemma) or "" 

174# text += " " 

175# elif tag == "ref": 

176# root = child.getroottree().getroot() 

177# target_id = child.attrib['target'].lstrip("#") 

178# ns = {"tei": "http://www.tei-c.org/ns/1.0"} 

179# target = root.xpath(f"//*[@xml:id='{target_id}']", namespaces=ns) 

180 

181# breakpoint() 

182# child_text = extract_text(target[0]) if target else extract_text(child) 

183# text += child_text or "" 

184# else: 

185# text += extract_text(child) or "" 

186 

187# if tag == "w": 

188# text += " " 

189 

190# if include_tail: 

191# text += node.tail or "" 

192 

193# return text 

194 

195 

def get_verse_element(doc:ElementTree|Element, verse:str) -> Element|None:
    """ Finds the first <ab> element whose 'n' attribute equals `verse`; returns None when there is none."""
    xpath = f".//ab[@n='{verse}']"
    return find_element(doc, xpath)

198 

199 

def get_verse_text(doc:ElementTree|Element, verse:str) -> str|None:
    """
    Returns the stripped text of the verse's <ab> element as produced by extract_text.

    None is returned when the verse is not found.
    """
    verse_element = get_verse_element(doc, verse)
    return None if verse_element is None else extract_text(verse_element).strip()

206 

207 

def add_witness_readings(readings:Element|list[Element], siglum:str) -> None:
    """
    Adds the witness to the 'wit' attribute of each given reading that does not list it yet.

    The witness is written as a '#'-prefixed pointer. Readings that already list
    the siglum (with or without the '#') are left unchanged.

    Args:
        readings (Element|list[Element]): A single reading element or a list of them.
        siglum (str): The siglum of the witness, with or without a leading '#'.
    """
    if isinstance(readings, Element):
        readings = [readings]

    # Build the pointer once and leave `siglum` untouched, so that every reading
    # is checked against the same value regardless of its position in the list.
    pointer = siglum if siglum.startswith("#") else "#" + siglum

    for reading in readings:
        if reading_has_witness(reading, siglum):
            continue

        existing = reading.attrib.get('wit', "")
        reading.attrib['wit'] = f"{existing} {pointer}".strip()

223 

224 

def remove_witnesss_readings(readings:Element|list[Element], siglum:str) -> None:
    """
    Removes the witness from the 'wit' attribute of each given reading that lists it.

    Both the bare siglum and its '#'-prefixed form are removed; the remaining
    witnesses are written back separated by single spaces.

    Args:
        readings (Element|list[Element]): A single reading element or a list of them.
        siglum (str): The siglum of the witness to remove.
    """
    targets = [readings] if isinstance(readings, Element) else readings
    unwanted = (siglum, f"#{siglum}")

    for reading in targets:
        if reading_has_witness(reading, siglum):
            remaining = [witness for witness in reading.attrib['wit'].split() if witness not in unwanted]
            reading.attrib['wit'] = " ".join(remaining)

236 

237 

def write_tei(doc:ElementTree, path:Path|str) -> None:
    """
    Writes the document to `path` as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of `path` are created first.
    """
    parent_directory = Path(path).parent
    parent_directory.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )

241 

242 

def get_witness_list(apparatus:ElementTree|Element) -> Element:
    """
    Returns the first <listWit> element of the apparatus.

    Raises:
        ValueError: If the apparatus contains no <listWit> element.
    """
    list_wit = find_element(apparatus, ".//listWit")
    if list_wit is not None:
        return list_wit

    raise ValueError("Could not find <listWit> element in the apparatus.")

249 

250 

def add_siglum(apparatus:ElementTree|Element, siglum:str) -> Element:
    """
    Adds a <witness> element to the <listWit> element in the apparatus.

    If a <witness> whose 'n' attribute equals `siglum` is already present, it is
    returned and nothing is added.

    Args:
        apparatus (ElementTree|Element): The apparatus document or element.
        siglum (str): The siglum to store in the 'n' attribute of the witness.

    Returns:
        Element: The existing or newly created <witness> element.

    Raises:
        ValueError: If the apparatus has no <listWit> element (raised by get_witness_list).
    """
    if isinstance(apparatus, ElementTree):
        apparatus = apparatus.getroot()

    list_wit = get_witness_list(apparatus)

    # Check if the witness already exists. The comparison with None is deliberate:
    # an lxml element without children is falsy, so a truthiness test would treat
    # an existing, childless <witness> as missing and append a duplicate.
    witness_element = find_element(list_wit, f".//witness[@n='{siglum}']")
    if witness_element is None:
        witness_element = ET.Element("witness", attrib={"n": siglum})
        list_wit.append(witness_element)

    return witness_element

265 

266 

def has_witness(apparatus:ElementTree|Element, siglum:str) -> bool:
    """
    Tells whether the <listWit> of the apparatus contains a <witness> whose 'n' attribute equals `siglum`.

    Raises:
        ValueError: If the apparatus has no <listWit> element (raised by get_witness_list).
    """
    list_wit = get_witness_list(apparatus)
    match = find_element(list_wit, f".//witness[@n='{siglum}']")
    return match is not None

270 

271 

def reading_has_witness(reading:Element, siglum:str) -> bool:
    """
    Tells whether the whitespace-separated 'wit' attribute of the reading lists the siglum.

    The siglum matches either as given or with a '#' put in front of it.
    A reading without a 'wit' attribute never matches.
    """
    wit = reading.attrib.get('wit')
    if wit is None:
        return False

    listed = wit.split()
    for candidate in (siglum, f"#{siglum}"):
        if candidate in listed:
            return True
    return False

278 

279 

280def add_wit_detail(apps:Element|set[Element], siglum:str, note:str="", phrase:str="", phrase_lang:str="", resp_id:str="VorlageLLM") -> None: 

281 if isinstance(apps, Element): 

282 apps = [apps] 

283 for app in apps: 

284 wit_detail = ET.SubElement(app, "witDetail", wit=siglum, resp=f"#{resp_id}") 

285 if phrase: 

286 phrase_element = ET.SubElement(wit_detail, "phr") 

287 phrase_element.text = phrase 

288 if phrase_lang: 

289 phrase_element.attrib['{http://www.w3.org/XML/1998/namespace}lang'] = phrase_lang 

290 if note: 

291 ET.SubElement(wit_detail, "note").text = note 

292 

293 

294def find_parent(element:Element, tag:str) -> Element|None: 

295 """ 

296 Finds the nearest ancestor of the given element with the specified tag. 

297 

298 Args: 

299 element (Element): The starting XML element from which to search upward. 

300 tag (str): The tag name of the ancestor element to find. 

301 

302 Returns: 

303 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found. 

304 

305 Example: 

306 >>> from xml.etree.ElementTree import Element 

307 >>> root = Element('root') 

308 >>> ab = Element('ab') 

309 >>> section = Element('section') 

310 >>> target = Element('target') 

311 >>> root.append(ab) 

312 >>> ab.append(section) 

313 >>> section.append(target) 

314 >>> result = find_parent(target, 'ab') 

315 >>> assert result == ab 

316 

317 This will find the <ab> ancestor of the <target> element. 

318 """ 

319 while element is not None: 

320 element_tag = re.sub(r"{.*}", "", element.tag) 

321 if element_tag == tag: 

322 return element 

323 element = element.getparent() 

324 return None 

325 

326 

def strip_namespace(element: Element) -> Element:
    """
    Remove the namespace from the tag of an element and of all its descendants.

    The element is modified in place and returned. Nodes whose tag is not a
    string (such as comments) are left untouched.
    """
    # iter() yields the element itself first, followed by its descendants.
    for elem in element.iter():
        if isinstance(elem.tag, str):
            elem.tag = elem.tag.split('}', 1)[-1]  # Remove namespace
    return element

334 

335 

def write_elements(elements:list[Element], output_file:Path, root_tag:str="body", **kwargs) -> None:
    """
    Wraps the given XML elements in a new root element and writes the result to a file.

    Each element has its namespace stripped (see strip_namespace) before it is
    appended to the new root. Entries that are None are skipped.

    Args:
        elements (list[Element]): The XML elements to write.
        output_file (Path): The file the XML is written to.
        root_tag (str): Tag name of the wrapping root element. Defaults to "body".
        **kwargs: Passed on when the root element is created.

    Returns:
        None: The XML structure is written to `output_file`; nothing is returned.
    """
    root = new_element(root_tag, **kwargs)
    for element in elements:
        if element is None:
            continue
        stripped = strip_namespace(element)
        root.append(stripped)

    write_tei(new_element_tree(root), output_file)

355 

356 

def get_apparatus_verse_text(app:Element, witness:str="") -> str:
    """
    Returns the text of the verse containing `app`, with the text of `app` itself wrapped in ⸂ and ⸃.

    The direct children of the enclosing <ab> element are visited in order.
    <pc>, <witDetail> and <note> children are skipped. For each <app> child one
    reading supplies the text: the first <rdg> listing `witness` when a witness
    is given and listed there, otherwise the <lem>, otherwise the first <rdg>,
    otherwise the <app> child itself.

    Args:
        app (Element): The apparatus entry to highlight.
        witness (str): Optional siglum whose readings are preferred.

    Returns:
        str: The verse text with runs of whitespace collapsed to single spaces.

    Raises:
        ValueError: If `app` has no enclosing <ab> element.
    """
    parent = find_parent(app, 'ab')
    if parent is None:
        raise ValueError("Could not find an <ab> element containing the apparatus entry.")

    parts = [(parent.text or "").strip()]
    for child in parent:
        # Nodes such as comments have a tag that is not a string.
        if not isinstance(child.tag, str):
            continue

        tag = re.sub(r"{.*}", "", child.tag)
        if tag in ["pc", "witDetail", "note"]:
            continue

        if tag != "app":
            parts.append((extract_text(child) or "").strip())
            continue

        lemma = None
        if witness:
            for reading in find_elements(child, ".//rdg"):
                if reading_has_witness(reading, witness):
                    lemma = reading
                    break
        if lemma is None:
            lemma = find_element(child, ".//lem")
        if lemma is None:
            lemma = find_element(child, ".//rdg")
        if lemma is None:
            # Fall back to the entry being processed, not to the highlighted one.
            lemma = child

        app_text = (extract_text(lemma, include_tail=False) or "").strip()
        if child == app:
            app_text = f"⸂{app_text}⸃"
        parts.append(app_text)

        # The reading's tail was excluded above; the text that follows this entry is its own tail.
        if child.tail:
            parts.append(child.tail.strip())

    text = " ".join(parts) + " " + (parent.tail or "")
    return re.sub(r"\s+", " ", text.strip())

400 

401 

def extract_text(node:Element, include_tail:bool=True, strip:bool=True) -> str:
    """
    Returns the text content of a node and its descendants with whitespace collapsed.

    Special cases:
        - None, and nodes whose tag is not a string, give "".
        - <pc>, <witDetail> and <note> elements give "".
        - For an <app>, the text of its <lem> is returned, or of its first <rdg>
          when there is no <lem>. An <app> with neither is treated like any other element.
        - For a <ref>, the text of the element whose xml:id equals the 'target'
          attribute (without leading '#') is returned when such an element exists.
        - A space is appended after a <w>, and after an <lb> unless its 'break'
          attribute is "no".

    Args:
        node (Element): The node to extract text from.
        include_tail (bool): Whether the node's tail text is appended.
        strip (bool): Whether leading and trailing whitespace is removed from the result.

    Returns:
        str: The extracted text.

    Raises:
        KeyError: If a <ref> element has no 'target' attribute.
    """
    if node is None:
        return ""

    if not isinstance(node.tag, str):
        return ""
    tag = re.sub(r"{.*}", "", node.tag)

    if tag in ["pc", "witDetail", "note"]:
        return ""
    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        # Compare with None explicitly: an lxml element without children is falsy,
        # so a truthiness test would ignore a reading that holds only text.
        if lemma is not None:
            return extract_text(lemma, strip=False) or ""
    if tag == "ref":
        root = node.getroottree().getroot()
        target_id = node.attrib['target'].lstrip("#")
        ns = {"tei": "http://www.tei-c.org/ns/1.0"}
        target = root.xpath(f"//*[@xml:id='{target_id}']", namespaces=ns)

        if target:
            return extract_text(target[0], strip=strip)

    text = node.text or ""
    for child in node:
        text += extract_text(child, strip=False)

    if include_tail and node.tail:
        text += node.tail

    text = re.sub(r"\s+", " ", text)

    # Words and (breaking) line beginnings are followed by a separating space.
    if tag == "w" or (tag == "lb" and node.attrib.get("break", "").lower() != "no"):
        text += " "

    if strip:
        text = text.strip()

    return text

444 

445 

def readings_for_witness(app:Element, siglum:str) -> set[Element]:
    """
    Gathers the <rdg> elements of an apparatus entry that list the given witness.

    Args:
        app (Element): The apparatus entry to search.
        siglum (str): The siglum of the witness.

    Returns:
        set[Element]: The reading elements whose 'wit' attribute lists the witness.
    """
    matching = set()
    for reading in find_elements(app, ".//rdg"):
        if reading_has_witness(reading, siglum):
            matching.add(reading)
    return matching

459 

460 

def add_doc_metadata(witness_element:Element, doc:ElementTree) -> Element:
    """
    Copies the metadata of `doc` into the `biblFull` element of `witness_element`.

    The first `biblFull` element below `witness_element` is used; when there is
    none, a new one is created as a child of `witness_element`. A deep copy of
    every child of the `fileDesc` element found in `doc` is then appended to it.
    Nothing is copied when `doc` has no `fileDesc` element.

    Args:
        witness_element (Element): The witness element that receives the metadata.
        doc (ElementTree): The document the metadata is taken from.

    Returns:
        Element: The `biblFull` element holding the copied metadata.
    """
    bibl_full = find_element(witness_element, ".//biblFull")
    if bibl_full is None:
        bibl_full = ET.SubElement(witness_element, "biblFull")

    file_description = find_element(doc, ".//fileDesc")
    source_children = [] if file_description is None else file_description
    for source_child in source_children:
        bibl_full.append(copy.deepcopy(source_child))

    return bibl_full

485 

486 

def add_responsibility_statement(doc:ElementTree, xml_id:str, description:str) -> tuple[Element,str]:
    """
    Adds a responsibility statement to the XML document.

    A <respStmt> with a <resp> child is appended to the <titleStmt> of the
    document. The <teiHeader>, <fileDesc> and <titleStmt> elements are created
    when they are missing. The <resp> carries the current local time in its
    'when' attribute and `description` as its text.

    Args:
        doc (ElementTree): The XML document to which the responsibility statement will be added.
        xml_id (str): The requested xml:id of the statement. If a <respStmt> with this
            ID already exists in the <titleStmt>, '-2', '-3', ... is appended to the
            requested ID until an unused one is found.
        description (str): The description of the responsibility statement.

    Returns:
        Element: The responsibility statement element that was added to the document.
        str: The unique ID of the responsibility statement.
    """
    if isinstance(doc, ElementTree):
        doc = doc.getroot()

    header = find_element(doc, ".//teiHeader")
    if header is None:
        header = ET.SubElement(doc, "teiHeader")

    file_description = find_element(header, ".//fileDesc")
    if file_description is None:
        file_description = ET.SubElement(header, "fileDesc")

    title_statement = find_element(file_description, ".//titleStmt")
    if title_statement is None:
        title_statement = ET.SubElement(file_description, "titleStmt")

    # Get unique ID, keeping the caller's ID as the stem of any numbered variant
    base_id = xml_id
    counter = 1
    while find_element(title_statement, f".//respStmt[@{{http://www.w3.org/XML/1998/namespace}}id='{xml_id}']") is not None:
        counter += 1
        xml_id = f"{base_id}-{counter}"

    responsibility_statement = ET.SubElement(title_statement, "respStmt")
    responsibility_statement.attrib['{http://www.w3.org/XML/1998/namespace}id'] = xml_id

    # Get datetime in required format
    current_time = datetime.now()
    formatted_time = current_time.strftime('%Y-%m-%dT%H:%M:%S')

    resp = ET.SubElement(responsibility_statement, "resp", when=formatted_time)
    resp.text = description

    return responsibility_statement, xml_id

532 

533 

def add_responsibility_statement_llm(doc:ElementTree, siglum:str, model_id:str) -> tuple[Element,str]:
    """
    Adds a responsibility statement recording that a witness was added with an LLM.

    The requested ID and the description are built from the siglum and the model
    ID and handed to add_responsibility_statement.

    Args:
        doc (ElementTree): The XML document to which the responsibility statement will be added.
        siglum (str): The siglum of the witness.
        model_id (str): The ID of the LLM model used.

    Returns:
        Element: The responsibility statement element that was added to the document.
        str: The unique ID of the responsibility statement.
    """
    requested_id = f"VorlageLLM-{siglum}-{model_id}"
    statement_text = f"Witness '{siglum}' added using VorlageLLM using LLM '{model_id}'"
    return add_responsibility_statement(doc, requested_id, statement_text)