Coverage for rdgai/tei.py: 93.26%

89 statements  

« prev     ^ index     » next       coverage.py v7.11.1, created at 2025-11-08 01:02 +0000

1import re 

2from pathlib import Path 

3from lxml import etree as ET 

4from lxml.etree import _ElementTree as ElementTree 

5from lxml.etree import _Element as Element 

6 

7from .languages import convert_language_code 

8 

9 

def get_language_code(doc:ElementTree|Element) -> str:
    """
    Returns the value of the ``xml:lang`` attribute on the first ``<text>`` element.

    Args:
        doc (ElementTree|Element): The document (or element) to search in.

    Returns:
        str: The ``xml:lang`` value, or an empty string if there is no ``<text>``
            element or it carries no ``xml:lang`` attribute.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
    text_element = find_element(doc, ".//text")
    return "" if text_element is None else text_element.attrib.get(xml_lang, "")

17 

18 

def get_language(doc:ElementTree|Element) -> str:
    """
    Reads the language code of the document and passes it through ``convert_language_code``.

    Args:
        doc (ElementTree|Element): The document (or element) to read the language code from.

    Returns:
        str: Whatever ``convert_language_code`` returns for the document's ``xml:lang`` code
            (an empty code is passed on when the document declares none).
    """
    return convert_language_code(get_language_code(doc))

22 

23 

def make_nc_name(string):
    """
    Replaces characters that are not wanted in an identifier with underscores.

    Every occurrence of one of the characters listed in ``invalid_chars`` (punctuation,
    brackets, quotes, the backtick and the space) is replaced by a single ``_``.
    All other characters are kept unchanged.

    NOTE: the first character is not treated specially, so a result that starts with
    a digit, ``.`` or ``-`` is returned as-is.
    """
    invalid_chars = "!\"#$%&'()*+/:;<=>?@[\\]^,{|}~` "
    # Map the code point of each unwanted character to an underscore.
    replacements = dict.fromkeys(map(ord, invalid_chars), "_")
    return string.translate(replacements)

31 

32 

def extract_text(node:Element, include_tail:bool=True) -> str:
    """
    Recursively collects the readable text of an element.

    Rules applied while walking the tree:
      * ``None`` yields an empty string.
      * Comments and processing instructions contribute only the text that follows them.
      * ``<pc>``, ``<witDetail>`` and ``<note>`` elements yield an empty string
        (their own text, their children and their tail are all left out).
      * ``<app>`` yields the text of its first ``<lem>`` descendant, or of its first
        ``<rdg>`` descendant if there is no ``<lem>``.
      * ``<ref>`` yields the text of the element whose ``xml:id`` matches the ``target``
        attribute (leading ``#`` removed). If there is no usable target, the ``<ref>``
        is treated like any other element.
      * Any other element yields its own text, the text of its children and, if requested,
        its tail, joined with single spaces and stripped at both ends.

    Args:
        node (Element): The element to extract text from (may be ``None``).
        include_tail (bool): Whether the text following the element is appended.

    Returns:
        str: The extracted text.
    """
    if node is None:
        return ""

    # Iterating over an element also yields comments and processing instructions.
    # Their ``tag`` is not a string, so they cannot be handled by the tag logic below;
    # only the text that follows them belongs to the running text.
    if not isinstance(node.tag, str):
        return (node.tail or "").strip() if include_tail else ""

    # Drop the "{namespace}" part of the qualified tag name.
    tag = re.sub(r"{.*}", "", node.tag)

    if tag in ("pc", "witDetail", "note"):
        return ""

    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        # extract_text(None) already gives "" when neither <lem> nor <rdg> exists.
        return extract_text(lemma)

    if tag == "ref":
        # A <ref> without a target attribute falls through to the generic handling below.
        target_id = node.attrib.get("target", "").lstrip("#")
        if target_id:
            root = node.getroottree().getroot()
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}
            # Pass the id as an XPath variable so quotes in it cannot break the expression.
            target = root.xpath("//*[@xml:id=$target_id]", namespaces=ns, target_id=target_id)
            if target:
                return extract_text(target[0])

    parts = [node.text or ""]
    parts.extend(extract_text(child) for child in node)
    if include_tail and node.tail:
        parts.append(node.tail)

    return " ".join(parts).strip()

64 

65 

def read_tei(path:Path) -> ElementTree:
    """
    Parses the TEI XML file at ``path``.

    Whitespace-only text nodes are discarded by the parser (``remove_blank_text=True``).

    Args:
        path (Path): Location of the XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    # Open in binary mode so that the XML parser decodes the bytes itself, following
    # the encoding declared in the document, instead of Python decoding the file with
    # the platform's default text encoding first.
    with open(path, 'rb') as f:
        return ET.parse(f, parser)

70 

71 

def find_element(doc:ElementTree|Element, xpath:str) -> Element|None:
    """
    Finds the first element matching ``xpath``.

    The search is first run with the namespace map of the root element (plus the ``xml``
    prefix). If that finds nothing, the search is repeated without any namespace map.

    Args:
        doc (ElementTree|Element): The document or element to search in. Must not be ``None``.
        xpath (str): The path expression handed to ``find``.

    Returns:
        Element|None: The first match, or ``None`` if nothing matches or the
            namespace-free search raises a ``SyntaxError``.
    """
    assert doc is not None, f"Document is None in find_element({doc}, {xpath})"

    root = doc.getroot() if isinstance(doc, ElementTree) else doc
    prefix_map = root.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}

    match = root.find(xpath, namespaces=prefix_map)
    if match is not None:
        return match

    # Second attempt without a namespace map.
    try:
        return root.find(xpath)
    except SyntaxError:
        return None

84 

85 

def find_elements(doc:ElementTree|Element, xpath:str) -> list[Element]:
    """
    Finds all elements matching ``xpath``.

    The search is run with the namespace map of the root element (plus the ``xml``
    prefix) and again without any namespace map. The results of both searches are
    combined, with each element listed only once.

    Args:
        doc (ElementTree|Element): The document or element to search in.
        xpath (str): The path expression handed to ``findall``.

    Returns:
        list[Element]: The matching elements; namespace-aware matches come first.
    """
    if isinstance(doc, ElementTree):
        doc = doc.getroot()
    namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}
    results = doc.findall(xpath, namespaces=namespaces)

    # The namespace-free search cannot resolve prefixes used in the path; treat that
    # as "no additional matches", in line with find_element.
    try:
        plain_results = doc.findall(xpath)
    except SyntaxError:
        plain_results = []

    # Both searches can hit the same elements (e.g. when the document has no default
    # namespace), so only add elements that are not already present.
    seen = {id(element) for element in results}
    results.extend(element for element in plain_results if id(element) not in seen)
    return results

93 

94 

def find_parent(element:Element, tag:str) -> Element|None:
    """
    Walks upward from ``element`` and returns the first element whose tag is ``tag``.

    The search starts at ``element`` itself, so if its own tag already matches it is
    returned. Tags are compared without their ``{namespace}`` part.

    Args:
        element (Element): The element from which to start searching upward.
        tag (str): The tag name (without namespace) to look for.

    Returns:
        Element|None: ``element`` or its nearest ancestor with the given tag,
            or ``None`` if the top of the tree is reached without a match.

    Example:
        >>> from lxml import etree
        >>> root = etree.Element('root')
        >>> ab = etree.SubElement(root, 'ab')
        >>> section = etree.SubElement(ab, 'section')
        >>> target = etree.SubElement(section, 'target')
        >>> find_parent(target, 'ab') is ab
        True

        This finds the <ab> ancestor of the <target> element.
    """
    current = element
    while True:
        if current is None:
            # Ran past the root (or started with None) without finding the tag.
            return None
        local_name = re.sub(r"{.*}", "", current.tag)
        if local_name == tag:
            return current
        current = current.getparent()

126 

127 

def write_tei(doc:ElementTree, path:Path|str) -> None:
    """
    Writes the document to ``path`` as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of ``path`` are created first.

    Args:
        doc (ElementTree): The document to serialise.
        path (Path|str): The destination file.
    """
    target_dir = Path(path).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )

131 

132 

def get_reading_identifier(reading:Element, check:bool=False, create_if_necessary:bool=True) -> str:
    """
    Returns the identifier of a reading element.

    The ``xml:id`` attribute is preferred; the ``n`` attribute is used when ``xml:id``
    is missing or empty. If neither is set and ``create_if_necessary`` is true, the
    smallest positive integer that is not yet used as ``n`` by an ``<rdg>`` below the
    reading's parent is chosen, stored in the reading's ``n`` attribute and returned.

    Args:
        reading (Element): The reading element.
        check (bool): If true, assert that an identifier was found or created.
        create_if_necessary (bool): If true, generate and store an ``n`` value when the
            reading has no identifier.

    Returns:
        str: The identifier, or an empty string if there is none and none was created.
    """
    attributes = reading.attrib
    identifier = attributes.get("{http://www.w3.org/XML/1998/namespace}id", "") or attributes.get("n", "")

    if create_if_necessary and not identifier:
        app = reading.getparent()
        # Count upward until a number is found that no <rdg> under the parent uses as @n.
        candidate = 1
        while find_element(app, f".//rdg[@n='{candidate}']") is not None:
            candidate += 1
        identifier = str(candidate)
        reading.attrib["n"] = identifier

    if check:
        assert identifier, f"Reading {reading} must have a name attribute 'xml:id' or 'n'."

    return identifier