Coverage for rdgai/tei.py: 93.26%
89 statements
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-08 01:02 +0000
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-08 01:02 +0000
1import re
2from pathlib import Path
3from lxml import etree as ET
4from lxml.etree import _ElementTree as ElementTree
5from lxml.etree import _Element as Element
7from .languages import convert_language_code
def get_language_code(doc:ElementTree|Element) -> str:
    """
    Returns the ``xml:lang`` value of the first ``<text>`` element found in the document.

    Args:
        doc (ElementTree|Element): The document (or element) to search.

    Returns:
        str: The value of the ``xml:lang`` attribute, or an empty string when there is
            no ``<text>`` element or it has no ``xml:lang`` attribute.
    """
    text_element = find_element(doc, ".//text")
    if text_element is not None:
        return text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")
    return ""
def get_language(doc:ElementTree|Element) -> str:
    """
    Returns the document's language as produced by ``convert_language_code``.

    The code passed to ``convert_language_code`` is whatever ``get_language_code``
    reads from the document (an empty string if none is present).
    """
    return convert_language_code(get_language_code(doc))
def make_nc_name(string):
    """
    Replaces every character listed in ``invalid_chars`` (punctuation and the space)
    with an underscore and returns the resulting string.

    Note: a leading digit, "." or "-" is left as it is; no prefix is added.
    """
    invalid_chars = "!\"#$%&'()*+/:;<=>?@[\\]^,{|}~` "
    # Map each code point to "_" so the whole string is converted in a single pass.
    replacements = {ord(char): "_" for char in invalid_chars}
    return string.translate(replacements)
def extract_text(node:Element, include_tail:bool=True) -> str:
    """
    Extracts the text of a node and its descendants as a single stripped string.

    Rules applied per element (matched on the tag name without its namespace):
      * ``pc``, ``witDetail`` and ``note`` contribute nothing.
      * ``app`` contributes the text of its first ``lem``, or of its first ``rdg``
        when there is no ``lem``.
      * ``ref`` contributes the text of the element whose ``xml:id`` equals the
        ``target`` attribute (leading ``#`` removed). If there is no ``target`` or no
        such element, the ``ref`` is treated like any other element.
      * Comments and processing instructions contribute only their tail text.
      * Any other element contributes its own text followed by the text of each child,
        joined with single spaces.

    Args:
        node (Element): The node to read. ``None`` yields an empty string.
        include_tail (bool): Whether to append the tail text of ``node`` itself.
            Children are always extracted with their tails.

    Returns:
        str: The collected text with leading and trailing whitespace removed.
    """
    if node is None:
        return ""

    # Comments and processing instructions expose a factory function as ``tag`` instead
    # of a string, which would make ``re.sub`` raise. Their content is not document
    # text, but the tail that follows them is.
    if not isinstance(node.tag, str):
        tail = node.tail if include_tail else None
        return (tail or "").strip()

    tag = re.sub(r"{.*}", "", node.tag)

    # NOTE(review): the early returns below do not append ``node.tail`` even when
    # include_tail is True; this is kept exactly as before - confirm it is intended.
    if tag in ["pc", "witDetail", "note"]:
        return ""
    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        return extract_text(lemma) or ""
    if tag == "ref":
        # A <ref> without a target attribute falls through to the generic handling
        # instead of raising KeyError.
        target_id = node.attrib.get("target", "").lstrip("#")
        if target_id:
            root = node.getroottree().getroot()
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}
            # The id is passed as an XPath variable so that quotes inside it cannot
            # break (or alter) the expression.
            target = root.xpath("//*[@xml:id=$target_id]", namespaces=ns, target_id=target_id)
            if target:
                return extract_text(target[0])

    text = node.text or ""
    for child in node:
        text += " " + extract_text(child)

    if include_tail and node.tail:
        text += " " + node.tail

    return text.strip()
def read_tei(path:Path) -> ElementTree:
    """
    Parses the XML file at ``path`` and returns the resulting tree.

    The parser is configured with ``remove_blank_text=True``.

    Args:
        path (Path): Location of the file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    # Open in binary mode so the XML parser determines the encoding from the document
    # itself rather than Python decoding it with the platform's default encoding first.
    with open(path, 'rb') as f:
        return ET.parse(f, parser)
72def find_element(doc:ElementTree|Element, xpath:str) -> Element|None:
73 assert doc is not None, f"Document is None in find_element({doc}, {xpath})"
74 if isinstance(doc, ElementTree):
75 doc = doc.getroot()
76 namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}
77 element = doc.find(xpath, namespaces=namespaces)
78 if element is None:
79 try:
80 element = doc.find(xpath)
81 except SyntaxError:
82 return None
83 return element
86def find_elements(doc:ElementTree|Element, xpath:str) -> Element|None:
87 if isinstance(doc, ElementTree):
88 doc = doc.getroot()
89 namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}
90 results = doc.findall(xpath, namespaces=namespaces)
91 results += doc.findall(xpath)
92 return results
95def find_parent(element:Element, tag:str) -> Element|None:
96 """
97 Finds the nearest ancestor of the given element with the specified tag.
99 Args:
100 element (Element): The starting XML element from which to search upward.
101 tag (str): The tag name of the ancestor element to find.
103 Returns:
104 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found.
106 Example:
107 >>> from xml.etree.ElementTree import Element
108 >>> root = Element('root')
109 >>> ab = Element('ab')
110 >>> section = Element('section')
111 >>> target = Element('target')
112 >>> root.append(ab)
113 >>> ab.append(section)
114 >>> section.append(target)
115 >>> result = find_parent(target, 'ab')
116 >>> assert result == ab
118 This will find the <ab> ancestor of the <target> element.
119 """
120 while element is not None:
121 element_tag = re.sub(r"{.*}", "", element.tag)
122 if element_tag == tag:
123 return element
124 element = element.getparent()
125 return None
def write_tei(doc:ElementTree, path:Path|str) -> None:
    """
    Writes the document to ``path`` as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of ``path`` are created first.

    Args:
        doc (ElementTree): The document to serialize.
        path (Path|str): Destination file.
    """
    parent_dir = Path(path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )
def get_reading_identifier(reading:Element, check:bool=False, create_if_necessary:bool=True) -> str:
    """
    Returns the identifier of a reading: its ``xml:id`` if set, otherwise its ``n``.

    Args:
        reading (Element): The reading element to inspect.
        check (bool): If True, assert that the identifier returned is not empty.
        create_if_necessary (bool): If True and the reading has neither attribute, the
            smallest positive integer not yet used as ``n`` by an ``rdg`` below the
            reading's parent is stored in the reading's ``n`` attribute and returned.

    Returns:
        str: The identifier, or an empty string if there is none and none was created.
    """
    identifier = (
        reading.attrib.get("{http://www.w3.org/XML/1998/namespace}id", "")
        or reading.attrib.get("n", "")
    )

    if create_if_necessary and not identifier:
        app = reading.getparent()
        # Count upward until a number is found that no sibling reading uses yet.
        candidate = 1
        while find_element(app, f".//rdg[@n='{candidate}']") is not None:
            candidate += 1
        identifier = str(candidate)
        reading.attrib["n"] = identifier

    if check:
        assert identifier, f"Reading {reading} must have a name attribute 'xml:id' or 'n'."

    return identifier