Coverage for vorlagellm/tei.py: 87.59%
282 statements
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
1from pathlib import Path
2from lxml import etree as ET
3from lxml.etree import _ElementTree as ElementTree
4from lxml.etree import _Element as Element
5from lxml.etree import Element as new_element
6from lxml.etree import ElementTree as new_element_tree
7import re
8from dataclasses import dataclass
9from datetime import datetime
10import copy
12from .languages import convert_language_code
15@dataclass
16class Permutation:
17 text:str
18 readings:list[Element]
19 apps:list[Element]=None
def read_tei(path:Path) -> ElementTree:
    """
    Parses the XML file at `path` and returns the resulting element tree.

    The parser is created with `remove_blank_text=True`, so ignorable
    whitespace-only text nodes are dropped while parsing.

    Args:
        path (Path): Location of the TEI XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    # Open as bytes so that the XML parser decodes the file according to its own
    # encoding declaration instead of the platform's default text encoding.
    with open(path, "rb") as f:
        return ET.parse(f, parser)
def find_element(doc:ElementTree|Element, xpath:str) -> Element|None:
    """
    Returns the first element below `doc` that matches `xpath`, or None if nothing matches.

    The lookup is attempted with the root element's namespace map first; when that
    finds nothing, it is repeated without any namespace map.
    """
    root = doc.getroot() if isinstance(doc, ElementTree) else doc
    match = root.find(xpath, namespaces=root.nsmap)
    if match is not None:
        return match
    return root.find(xpath)
def find_elements(doc:ElementTree|Element, xpath:str) -> list[Element]:
    """
    Returns every element below `doc` that matches `xpath`.

    The search uses the root element's namespace map. Unlike `find_element`,
    there is no second attempt without namespaces.

    Args:
        doc (ElementTree|Element): The tree or element to search in.
        xpath (str): The path expression passed to `findall`.

    Returns:
        list[Element]: The matching elements (empty if there are none).
    """
    if isinstance(doc, ElementTree):
        doc = doc.getroot()
    return doc.findall(xpath, namespaces=doc.nsmap)
def get_siglum(doc:ElementTree|Element) -> str:
    """
    Get the value of the 'n' attribute of the <title type="document"> element.

    Returns an empty string if the element or the attribute is not found.
    """
    title = find_element(doc, ".//title[@type='document']")
    return "" if title is None else title.attrib.get('n', "")
def get_language_code(doc:ElementTree|Element) -> str:
    """
    Returns the xml:lang attribute of the first <text> element.

    An empty string is returned when there is no <text> element or it has no xml:lang attribute.
    """
    text_element = find_element(doc, ".//text")
    if text_element is not None:
        return text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")
    return ""
def get_language(doc:ElementTree|Element) -> str:
    """Returns the document's xml:lang code after passing it through `convert_language_code`."""
    return convert_language_code(get_language_code(doc))
def app_has_witness(app:Element, siglum:str) -> bool:
    """ Returns True if at least one <rdg> element below the apparatus entry lists the given siglum."""
    for reading in find_elements(app, ".//rdg"):
        if reading_has_witness(reading, siglum):
            return True
    return False
def get_verses(doc:ElementTree|Element) -> list[str]:
    """ Returns the "n" attribute of every <ab> element that has one, in document order."""
    verses = []
    for ab in find_elements(doc, ".//ab"):
        if 'n' in ab.attrib:
            verses.append(ab.attrib['n'])
    return verses
def find_readings(element, ignore_types:list[str]|None) -> list[Element]:
    """
    Returns the <rdg> elements below `element`.

    When `ignore_types` is non-empty, readings whose "type" attribute is listed in it are left out.
    """
    candidates = find_elements(element, ".//rdg")
    if not ignore_types:
        return candidates
    return [rdg for rdg in candidates if rdg.attrib.get("type", "") not in ignore_types]
def get_reading_permutations(
    apparatus:ElementTree|Element,
    verse:str,
    witness:str="",
    bracket_app:Element|None=None,
    ignore_types:list[str]|None=None,
    max_permutations:int=0,
) -> list[Permutation]:
    """
    Builds the possible texts of a verse by combining the readings of its <app> elements.

    The direct children of the verse element are visited in order. Text from
    non-<app> children is appended to every permutation; each <app> multiplies
    the permutations by its readings.

    Args:
        apparatus (ElementTree|Element): The apparatus document containing the verse.
        verse (str): Value of the "n" attribute of the <ab> element to process.
        witness (str): Optional siglum. In any <app> where this witness is attested,
            only the readings that list it are used.
        bracket_app (Element|None): Optional <app> whose reading text is wrapped in ⸂…⸃.
        ignore_types (list[str]|None): Reading "type" values to skip.
        max_permutations (int): When positive, at most this many permutations are
            returned, sampled at evenly spaced positions. 0 means no limit.

    Returns:
        list[Permutation]: The permutations with whitespace-normalised text and
        `apps` set to the <app> elements of the verse. Empty if the verse is not found.
    """
    verse_element = get_verse_element(apparatus, verse)
    if verse_element is None:
        return []

    permutations = [Permutation(text="", readings=[])]
    apps = []
    for child in verse_element:
        # Comments and processing instructions have a non-string tag.
        if not isinstance(child.tag, str):
            continue

        tag = re.sub(r"\{.*\}", "", child.tag)
        if tag != "app":
            # Fixed text is shared by every permutation built so far.
            fixed_text = extract_text(child)
            for permutation in permutations:
                permutation.text = (permutation.text + " " + fixed_text).strip()
            continue

        apps.append(child)
        has_witness = bool(witness) and app_has_witness(child, witness)
        is_bracketed = bracket_app is not None and bracket_app == child

        # NOTE: an <app> that contributes no readings leaves no permutations at all.
        new_permutations = []
        for reading in find_readings(child, ignore_types=ignore_types):
            if has_witness and not reading_has_witness(reading, witness):
                continue

            reading_text = extract_text(reading) or ""
            if is_bracketed:
                reading_text = f"⸂{reading_text}⸃"

            for permutation in permutations:
                new_permutations.append(
                    Permutation(
                        text=permutation.text + " " + reading_text,
                        readings=permutation.readings + [reading],
                    )
                )
        permutations = new_permutations

    cleaned = [
        Permutation(
            text=re.sub(r"\s+", " ", permutation.text.strip()),
            readings=permutation.readings,
            apps=apps,
        )
        for permutation in permutations
    ]

    total = len(cleaned)
    if max_permutations > 0 and total > max_permutations:
        # Evenly spaced sample of exactly max_permutations items; a fixed stride of
        # total // max_permutations could return more than the requested maximum.
        return [cleaned[index * total // max_permutations] for index in range(max_permutations)]

    # Always return the cleaned permutations so that text normalisation and `apps`
    # are present whether or not the list was truncated.
    return cleaned
158# def extract_text(node:Element, include_tail:bool=True) -> str:
159# text = node.text or ""
160# for child in node:
161# if isinstance(child.tag, str):
162# tag = re.sub(r"{.*}", "", child.tag)
163# else:
164# continue
166# if tag in ["pc", "witDetail", "note"]:
167# continue
168# if tag == "app":
169# breakpoint()
170# lemma = find_element(child, ".//lem")
171# if lemma is None:
172# lemma = find_element(child, ".//rdg")
173# text += extract_text(lemma) or ""
174# text += " "
175# elif tag == "ref":
176# root = child.getroottree().getroot()
177# target_id = child.attrib['target'].lstrip("#")
178# ns = {"tei": "http://www.tei-c.org/ns/1.0"}
179# target = root.xpath(f"//*[@xml:id='{target_id}']", namespaces=ns)
181# breakpoint()
182# child_text = extract_text(target[0]) if target else extract_text(child)
183# text += child_text or ""
184# else:
185# text += extract_text(child) or ""
187# if tag == "w":
188# text += " "
190# if include_tail:
191# text += node.tail or ""
193# return text
def get_verse_element(doc:ElementTree|Element, verse:str) -> Element|None:
    """Returns the first <ab> element whose "n" attribute equals `verse`, or None if there is none."""
    xpath = f".//ab[@n='{verse}']"
    return find_element(doc, xpath)
def get_verse_text(doc:ElementTree|Element, verse:str) -> str|None:
    """Returns the stripped text extracted from the verse's <ab> element, or None if the verse is not found."""
    verse_element = get_verse_element(doc, verse)
    if verse_element is not None:
        return extract_text(verse_element).strip()
    return None
208def add_witness_readings( readings:Element|list[Element], siglum:str) -> None:
209 if isinstance(readings, Element):
210 readings = [readings]
212 for reading in readings:
213 if 'wit' not in reading.attrib:
214 reading.attrib['wit'] = ""
216 if reading_has_witness(reading, siglum):
217 continue
219 if not siglum.startswith("#"):
220 siglum = "#" + siglum
221 reading.attrib['wit'] += f" {siglum}"
222 reading.attrib['wit'] = reading.attrib['wit'].strip()
225def remove_witnesss_readings(readings:Element|list[Element], siglum:str) -> None:
226 if isinstance(readings, Element):
227 readings = [readings]
229 for reading in readings:
230 if not reading_has_witness(reading, siglum):
231 continue
233 witnesses = reading.attrib['wit'].split()
234 witnesses = [witness for witness in witnesses if witness != siglum and witness != f"#{siglum}"]
235 reading.attrib['wit'] = " ".join(witnesses)
def write_tei(doc:ElementTree, path:Path|str) -> None:
    """
    Writes the document to `path` as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of `path` are created first.
    """
    parent_directory = Path(path).parent
    parent_directory.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )
def get_witness_list(apparatus:ElementTree|Element) -> Element:
    """
    Returns the first <listWit> element of the apparatus.

    Raises:
        ValueError: If the apparatus has no <listWit> element.
    """
    list_wit = find_element(apparatus, ".//listWit")
    if list_wit is not None:
        return list_wit
    raise ValueError("Could not find <listWit> element in the apparatus.")
def add_siglum(apparatus:ElementTree|Element, siglum:str) -> Element:
    """
    Adds a <witness> element to the <listWit> element in the apparatus.

    If a <witness> with the same "n" attribute already exists, it is returned
    and nothing is added.

    Args:
        apparatus (ElementTree|Element): The apparatus document.
        siglum (str): The value for the "n" attribute of the <witness> element.

    Returns:
        Element: The existing or newly created <witness> element.

    Raises:
        ValueError: If the apparatus has no <listWit> element.
    """
    if isinstance(apparatus, ElementTree):
        apparatus = apparatus.getroot()

    list_wit = get_witness_list(apparatus)

    # Check if the witness already exists. The test must be `is None`: an element
    # without children is falsy, so a truthiness test would add a duplicate.
    witness_element = find_element(list_wit, f".//witness[@n='{siglum}']")
    if witness_element is None:
        witness_element = ET.Element("witness", attrib={"n": siglum})
        list_wit.append(witness_element)

    return witness_element
def has_witness(apparatus:ElementTree|Element, siglum:str) -> bool:
    """
    Returns True if the <listWit> of the apparatus contains a <witness> whose "n" attribute equals `siglum`.

    Raises:
        ValueError: If the apparatus has no <listWit> element.
    """
    list_wit = get_witness_list(apparatus)
    match = find_element(list_wit, f".//witness[@n='{siglum}']")
    return match is not None
def reading_has_witness(reading:Element, siglum:str) -> bool:
    """
    Returns True if the whitespace-separated "wit" attribute of the reading
    contains `siglum` or "#" followed by `siglum`.

    A reading without a "wit" attribute never matches.
    """
    wit = reading.attrib.get('wit')
    if wit is None:
        return False

    accepted = (siglum, f"#{siglum}")
    return any(witness in accepted for witness in wit.split())
def add_wit_detail(apps:Element|set[Element], siglum:str, note:str="", phrase:str="", phrase_lang:str="", resp_id:str="VorlageLLM") -> None:
    """
    Appends a <witDetail> element to one or more apparatus entries.

    Args:
        apps (Element|set[Element]): A single <app> element or a collection of them.
        siglum (str): Written to the "wit" attribute of the <witDetail>.
        note (str): If non-empty, added as the text of a <note> child.
        phrase (str): If non-empty, added as the text of a <phr> child.
        phrase_lang (str): If non-empty (and a phrase is given), set as xml:lang on the <phr>.
        resp_id (str): Written to the "resp" attribute as "#resp_id".
    """
    targets = [apps] if isinstance(apps, Element) else apps
    responsibility = f"#{resp_id}"

    for app in targets:
        wit_detail = ET.SubElement(app, "witDetail", wit=siglum, resp=responsibility)

        if phrase:
            phr = ET.SubElement(wit_detail, "phr")
            phr.text = phrase
            if phrase_lang:
                phr.attrib['{http://www.w3.org/XML/1998/namespace}lang'] = phrase_lang

        if note:
            note_element = ET.SubElement(wit_detail, "note")
            note_element.text = note
294def find_parent(element:Element, tag:str) -> Element|None:
295 """
296 Finds the nearest ancestor of the given element with the specified tag.
298 Args:
299 element (Element): The starting XML element from which to search upward.
300 tag (str): The tag name of the ancestor element to find.
302 Returns:
303 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found.
305 Example:
306 >>> from xml.etree.ElementTree import Element
307 >>> root = Element('root')
308 >>> ab = Element('ab')
309 >>> section = Element('section')
310 >>> target = Element('target')
311 >>> root.append(ab)
312 >>> ab.append(section)
313 >>> section.append(target)
314 >>> result = find_parent(target, 'ab')
315 >>> assert result == ab
317 This will find the <ab> ancestor of the <target> element.
318 """
319 while element is not None:
320 element_tag = re.sub(r"{.*}", "", element.tag)
321 if element_tag == tag:
322 return element
323 element = element.getparent()
324 return None
def strip_namespace(element: Element) -> Element:
    """
    Remove the namespace from the tag of an element and of all its descendants.

    The element is modified in place. Nodes whose tag is not a string
    (comments, processing instructions) are left untouched.

    Args:
        element (Element): The root of the subtree to process.

    Returns:
        Element: The same element that was passed in.
    """
    # iter() yields the element itself followed by every descendant.
    for elem in element.iter():
        if isinstance(elem.tag, str):
            elem.tag = elem.tag.split('}', 1)[-1]  # Remove namespace
    return element
def write_elements(elements:list[Element], output_file:Path, root_tag:str="body", **kwargs) -> None:
    """
    Writes a list of XML elements to a file, wrapping them in a specified root element.

    The elements are copied before they are written, so the trees they belong to
    are neither modified nor emptied. Namespaces are stripped from the copies only.

    Args:
        elements (list[Element]): List of XML elements to be written. None entries are skipped.
        output_file (Path): Path object specifying the file where the XML will be written.
        root_tag (str): Tag name for the root element that will wrap the elements. Defaults to "body".
        **kwargs: Forwarded to the constructor of the root element.

    Returns:
        None: This function does not return any value. It writes the XML structure to the specified file.
    """
    root = new_element(root_tag, **kwargs)
    for element in elements:
        if element is None:
            continue
        # Appending an element that already has a parent would move it out of its
        # original tree, and strip_namespace edits tags in place, so use a copy.
        root.append(strip_namespace(copy.deepcopy(element)))

    tree = new_element_tree(root)
    write_tei(tree, output_file)
def get_apparatus_verse_text(app:Element, witness:str="") -> str:
    """
    Returns the text of the verse containing `app`, with the text of `app` wrapped in ⸂…⸃.

    The direct children of the enclosing <ab> element are visited in order.
    <pc>, <witDetail> and <note> children are skipped. For each <app> child one
    reading is chosen: the first <rdg> attested by `witness` (if given and present),
    otherwise the <lem>, otherwise the first <rdg>, otherwise the <app> itself.

    Args:
        app (Element): The apparatus entry to highlight.
        witness (str): Optional siglum whose readings are preferred.

    Returns:
        str: The verse text with whitespace collapsed to single spaces.

    Raises:
        ValueError: If `app` has no <ab> ancestor.
    """
    parent = find_parent(app, 'ab')
    if parent is None:
        raise ValueError("Could not find an <ab> ancestor for the <app> element.")

    parts = [(parent.text or "").strip()]
    for child in parent:
        # Comments and processing instructions have a non-string tag.
        if not isinstance(child.tag, str):
            continue

        tag = re.sub(r"{.*}", "", child.tag)
        if tag in ["pc", "witDetail", "note"]:
            continue

        if tag != "app":
            parts.append((extract_text(child) or "").strip())
            continue

        lemma = None
        if witness:
            for reading in find_elements(child, ".//rdg"):
                if reading_has_witness(reading, witness):
                    lemma = reading
                    break
        if lemma is None:
            lemma = find_element(child, ".//lem")
        if lemma is None:
            lemma = find_element(child, ".//rdg")
        if lemma is None:
            # Fall back to the entry being processed, not the highlighted one.
            lemma = child

        app_text = (extract_text(lemma, include_tail=False) or "").strip()
        if child == app:
            app_text = f"⸂{app_text}⸃"
        parts.append(app_text)

        # The text following this entry belongs to this entry's tail.
        if child.tail:
            parts.append(child.tail.strip())

    parts.append(parent.tail or "")
    return re.sub(r"\s+", " ", " ".join(parts).strip())
def extract_text(node:Element, include_tail:bool=True, strip:bool=True) -> str:
    """
    Recursively collects the text of an element.

    Rules applied, in this order:
      * None and nodes with a non-string tag (comments, processing instructions) give "".
      * <pc>, <witDetail> and <note> give "".
      * <app> gives the text of its <lem>, or of its first <rdg> when there is no <lem>.
      * <ref> gives the text of the element whose xml:id matches its "target"
        attribute, when such an element exists.
      * Anything else gives its own text followed by the text of its children.
    Runs of whitespace are collapsed to one space. A space is appended after <w>
    and after <lb> unless the <lb> has break="no".

    Args:
        node (Element): The element to read. May be None.
        include_tail (bool): Whether to append the element's tail text.
        strip (bool): Whether to strip leading and trailing whitespace from the result.

    Returns:
        str: The extracted text.
    """
    if node is None:
        return ""

    if not isinstance(node.tag, str):
        return ""
    tag = re.sub(r"{.*}", "", node.tag)

    if tag in ["pc", "witDetail", "note"]:
        return ""

    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        # Must be an identity test: an element without child elements is falsy,
        # so a truthiness test would skip a reading that only contains text.
        if lemma is not None:
            # NOTE(review): the tail of the <app> itself is not included here — confirm intended.
            return extract_text(lemma, strip=False) or ""

    if tag == "ref":
        target_id = node.attrib.get('target', "").lstrip("#")
        if target_id:
            root = node.getroottree().getroot()
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}
            # Pass the id as an XPath variable so quotes in it cannot break the expression.
            target = root.xpath("//*[@xml:id=$target_id]", namespaces=ns, target_id=target_id)
            if target:
                return extract_text(target[0], strip=strip)

    text = node.text or ""
    text += "".join(extract_text(child, strip=False) for child in node)

    if include_tail and node.tail:
        text += node.tail

    text = re.sub(r"\s+", " ", text)

    if tag == "w" or (tag == "lb" and node.attrib.get("break", "").lower() != "no"):
        text += " "

    if strip:
        text = text.strip()

    return text
def readings_for_witness(app:Element, siglum:str) -> set[Element]:
    """
    Gathers the readings of an apparatus entry that are attested by a given witness.

    Args:
        app (Element): The XML element representing the apparatus entry.
        siglum (str): The siglum of the witness to look for.

    Returns:
        set[Element]: The <rdg> elements below `app` that list the witness.
    """
    matching = set()
    for reading in find_elements(app, ".//rdg"):
        if reading_has_witness(reading, siglum):
            matching.add(reading)
    return matching
def add_doc_metadata(witness_element:Element, doc:ElementTree) -> Element:
    """
    Copies the file description of `doc` into the `biblFull` element of a witness.

    The `biblFull` element below `witness_element` is looked up and created if it
    does not exist. If `doc` contains a `fileDesc` element, a deep copy of each of
    its children is appended to `biblFull`.

    Args:
        witness_element (Element): The XML element representing the witness to which metadata will be added.
        doc (ElementTree): The XML element tree containing the source metadata.

    Returns:
        Element: The `biblFull` element within the `witness_element`.
    """
    bibl_full = find_element(witness_element, ".//biblFull")
    if bibl_full is None:
        bibl_full = ET.SubElement(witness_element, "biblFull")

    file_description = find_element(doc, ".//fileDesc")
    if file_description is None:
        return bibl_full

    for original in file_description:
        bibl_full.append(copy.deepcopy(original))
    return bibl_full
def add_responsibility_statement(doc:ElementTree, xml_id:str, description:str) -> tuple[Element,str]:
    """
    Adds a responsibility statement to the XML document.

    A <respStmt> with a <resp> child is appended to teiHeader/fileDesc/titleStmt;
    any of these three elements that is missing is created. The <resp> carries
    the current local time in its "when" attribute.

    Args:
        doc (ElementTree): The XML document to which the responsibility statement will be added.
        xml_id (str): The requested xml:id. If a <respStmt> with this id already exists,
            "-2", "-3", ... is appended to it until the id is unused.
        description (str): The description of the responsibility statement.

    Returns:
        Element: The responsibility statement element that was added to the document.
        str: The unique ID of the responsibility statement.
    """
    if isinstance(doc, ElementTree):
        doc = doc.getroot()

    # Walk down teiHeader/fileDesc/titleStmt, creating whatever is missing.
    parent = doc
    for tag in ("teiHeader", "fileDesc", "titleStmt"):
        child = find_element(parent, f".//{tag}")
        if child is None:
            child = ET.SubElement(parent, tag)
        parent = child
    title_statement = parent

    # Get unique ID, derived from the requested one so that it stays recognisable.
    base_id = xml_id
    counter = 1
    while find_element(title_statement, f".//respStmt[@{{http://www.w3.org/XML/1998/namespace}}id='{xml_id}']") is not None:
        counter += 1
        xml_id = f"{base_id}-{counter}"

    responsibility_statement = ET.SubElement(title_statement, "respStmt")
    responsibility_statement.attrib['{http://www.w3.org/XML/1998/namespace}id'] = xml_id

    # Get datetime in required format
    formatted_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    resp = ET.SubElement(responsibility_statement, "resp", when=formatted_time)
    resp.text = description

    return responsibility_statement, xml_id
def add_responsibility_statement_llm(doc:ElementTree, siglum:str, model_id:str) -> tuple[Element,str]:
    """
    Adds a responsibility statement recording that a witness was added by an LLM.

    The requested xml:id and the description are both built from the siglum and
    the model ID and then handed to `add_responsibility_statement`.

    Args:
        doc (ElementTree): The XML document to which the responsibility statement will be added.
        siglum (str): The siglum of the witness.
        model_id (str): The ID of the LLM model used.

    Returns:
        Element: The responsibility statement element that was added to the document.
        str: The unique ID of the responsibility statement.
    """
    return add_responsibility_statement(
        doc,
        xml_id=f"VorlageLLM-{siglum}-{model_id}",
        description=f"Witness '{siglum}' added using VorlageLLM using LLM '{model_id}'",
    )