Coverage for rdgai/tei.py: 100.00%
82 statements
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-03 01:37 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-03 01:37 +0000
1import re
2from pathlib import Path
3from lxml import etree as ET
4from lxml.etree import _ElementTree as ElementTree
5from lxml.etree import _Element as Element
7from .languages import convert_language_code
def get_language_code(doc: ElementTree | Element) -> str:
    """
    Looks up the first <text> element and returns its xml:lang attribute.

    Args:
        doc (ElementTree | Element): The document or element to search.

    Returns:
        str: The value of the xml:lang attribute, or an empty string if there is
            no <text> element or it has no xml:lang attribute.
    """
    text_element = find_element(doc, ".//text")
    if text_element is not None:
        return text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")
    return ""
def get_language(doc: ElementTree | Element) -> str:
    """
    Returns the result of convert_language_code applied to the document's language code.

    Args:
        doc (ElementTree | Element): The document or element whose <text> xml:lang is read.

    Returns:
        str: Whatever convert_language_code returns for the code found by get_language_code.
    """
    return convert_language_code(get_language_code(doc))
# Maps every disallowed character to an underscore; built once rather than on every call.
_NC_NAME_TRANSLATION = str.maketrans(dict.fromkeys("!\"#$%&'()*+/:;<=>?@[\\]^,{|}~` ", "_"))


def make_nc_name(string):
    """
    Replaces characters that are not wanted in an identifier with underscores.

    Each of the characters ``!"#$%&'()*+/:;<=>?@[\\]^,{|}~``, the backtick and the space
    is replaced by a single underscore. All other characters (including '.', '-' and '_')
    are kept, so the result always has the same length as the input.

    The first character is not treated specially: a result starting with a digit,
    '.' or '-' is returned unchanged.

    Args:
        string (str): The text to sanitise.

    Returns:
        str: The sanitised text.
    """
    return string.translate(_NC_NAME_TRANSLATION)
def extract_text(node: Element, include_tail: bool = True) -> str:
    """
    Collects the readable text of an element and its descendants.

    Rules applied, in order:
      * ``None`` yields an empty string.
      * Comments and processing instructions (nodes whose tag is not a string)
        contribute no text of their own; only their tail is kept.
      * <pc>, <witDetail> and <note> elements yield an empty string.
      * <app> elements yield the text of their first <lem>, or of their first <rdg>
        if there is no <lem>.
      * Any other element yields its own text, followed by the extracted text of each
        child and (optionally) its tail, joined with single spaces and stripped.

    NOTE(review): for <pc>, <witDetail>, <note> and <app> the element's own tail is not
    included in the result - confirm that this is intended.

    Args:
        node (Element): The element to read. May be None.
        include_tail (bool): Whether to append the text that follows the node's closing tag.
            Children are always extracted with their tails.

    Returns:
        str: The extracted text with leading and trailing whitespace removed.
    """
    if node is None:
        return ""

    # Comments and processing instructions do not have a string tag, so the
    # namespace-stripping regex below would raise a TypeError on them.
    if not isinstance(node.tag, str):
        return (node.tail or "").strip() if include_tail else ""

    tag = re.sub(r"{.*}", "", node.tag)

    if tag in ("pc", "witDetail", "note"):
        return ""

    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        return extract_text(lemma)

    parts = [node.text or ""]
    parts.extend(extract_text(child) for child in node)
    if include_tail and node.tail:
        parts.append(node.tail)

    return " ".join(parts).strip()
def read_tei(path: Path) -> ElementTree:
    """
    Parses the XML file at the given path, discarding ignorable whitespace between elements.

    Args:
        path (Path): Location of the XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    # Open in binary mode so the XML parser determines the encoding from the document
    # itself instead of Python decoding the bytes with the platform's default encoding.
    with open(path, "rb") as f:
        return ET.parse(f, parser)
def find_element(doc: ElementTree | Element, xpath: str) -> Element | None:
    """
    Returns the first element matching the path, or None if nothing matches.

    The search is first run with the root's namespace map (plus the ``xml`` prefix).
    If that finds nothing, it is repeated without any namespace map; a SyntaxError
    raised by this second attempt is treated as "not found".

    Args:
        doc (ElementTree | Element): The document or element to search. Must not be None.
        xpath (str): The path expression passed to ``find``.

    Returns:
        Element | None: The first match, or None.
    """
    assert doc is not None, f"Document is None in find_element({doc}, {xpath})"

    root = doc.getroot() if isinstance(doc, ElementTree) else doc
    prefix_map = {**root.nsmap, "xml": "http://www.w3.org/XML/1998/namespace"}

    match = root.find(xpath, namespaces=prefix_map)
    if match is not None:
        return match

    # Fall back to a search that does not use the namespace map.
    try:
        return root.find(xpath)
    except SyntaxError:
        return None
def find_elements(doc: ElementTree | Element, xpath: str) -> list[Element]:
    """
    Returns every element matching the path.

    The search is run with the root's namespace map (plus the ``xml`` prefix) and then
    again without any namespace map. Matches from the second search are appended after
    those from the first; an element found by both searches is returned only once.
    A SyntaxError raised by the second search (e.g. the path uses a prefix that cannot be
    resolved without a namespace map) is treated as "no additional matches", in line
    with find_element.

    Args:
        doc (ElementTree | Element): The document or element to search.
        xpath (str): The path expression passed to ``findall``.

    Returns:
        list[Element]: The matching elements; empty if nothing matches.
    """
    if isinstance(doc, ElementTree):
        doc = doc.getroot()
    namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}
    results = list(doc.findall(xpath, namespaces=namespaces))

    try:
        without_namespaces = doc.findall(xpath)
    except SyntaxError:
        without_namespaces = []

    # Both searches can hit the same element (e.g. in a document without a default
    # namespace), so only add elements that have not been collected yet.
    seen = {id(element) for element in results}
    for element in without_namespaces:
        if id(element) not in seen:
            seen.add(id(element))
            results.append(element)

    return results
89def find_parent(element:Element, tag:str) -> Element|None:
90 """
91 Finds the nearest ancestor of the given element with the specified tag.
93 Args:
94 element (Element): The starting XML element from which to search upward.
95 tag (str): The tag name of the ancestor element to find.
97 Returns:
98 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found.
100 Example:
101 >>> from xml.etree.ElementTree import Element
102 >>> root = Element('root')
103 >>> ab = Element('ab')
104 >>> section = Element('section')
105 >>> target = Element('target')
106 >>> root.append(ab)
107 >>> ab.append(section)
108 >>> section.append(target)
109 >>> result = find_parent(target, 'ab')
110 >>> assert result == ab
112 This will find the <ab> ancestor of the <target> element.
113 """
114 while element is not None:
115 element_tag = re.sub(r"{.*}", "", element.tag)
116 if element_tag == tag:
117 return element
118 element = element.getparent()
119 return None
def write_tei(doc: ElementTree, path: Path | str) -> None:
    """
    Writes the document to the given path as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of the target file are created first.

    Args:
        doc (ElementTree): The document to serialise.
        path (Path | str): Where to write the file.
    """
    target_directory = Path(path).parent
    target_directory.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )
def get_reading_identifier(reading: Element, check: bool = False, create_if_necessary: bool = True) -> str:
    """
    Returns the identifier of a reading, optionally creating one.

    The ``xml:id`` attribute is preferred; if it is missing or empty, the ``n`` attribute is used.
    If neither is set and ``create_if_necessary`` is true, the smallest positive integer that is
    not already used as the ``n`` of an <rdg> below the reading's parent is chosen, stored in the
    reading's ``n`` attribute and returned. A reading without a parent simply gets "1".

    Args:
        reading (Element): The reading element to identify. Its ``n`` attribute may be modified.
        check (bool): If true, assert that the resulting identifier is not empty.
        create_if_necessary (bool): Whether to generate and store an ``n`` value when the
            reading has no identifier.

    Returns:
        str: The identifier, or an empty string if there is none and none was created.

    Raises:
        AssertionError: If ``check`` is true and no identifier is available.
    """
    identifier = reading.attrib.get("{http://www.w3.org/XML/1998/namespace}id", "")
    if not identifier:
        identifier = reading.attrib.get("n", "")

    if not identifier and create_if_necessary:
        app = reading.getparent()
        number = 1
        # A detached reading has no sibling readings to clash with, so there is nothing to search.
        if app is not None:
            while find_element(app, f".//rdg[@n='{number}']") is not None:
                number += 1
        identifier = str(number)
        reading.attrib["n"] = identifier

    if check:
        assert identifier, f"Reading {reading} must have a name attribute 'xml:id' or 'n'."

    return identifier