Coverage for rdgai/tei.py: 100.00%

82 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-01-03 01:37 +0000

1import re 

2from pathlib import Path 

3from lxml import etree as ET 

4from lxml.etree import _ElementTree as ElementTree 

5from lxml.etree import _Element as Element 

6 

7from .languages import convert_language_code 

8 

9 

def get_language_code(doc:ElementTree|Element) -> str:
    """
    Returns the ``xml:lang`` value of the first ``<text>`` element in the document.

    Args:
        doc (ElementTree|Element): The document or element to search.

    Returns:
        str: The value of the ``xml:lang`` attribute, or an empty string if there is
            no ``<text>`` element or it has no ``xml:lang`` attribute.
    """
    text_element = find_element(doc, ".//text")
    return (
        ""
        if text_element is None
        else text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")
    )

17 

18 

def get_language(doc:ElementTree|Element) -> str:
    """
    Returns the document's language as produced by ``convert_language_code``.

    The ``xml:lang`` code is read with ``get_language_code`` and handed unchanged to
    ``convert_language_code``.

    Args:
        doc (ElementTree|Element): The document or element to read the language from.

    Returns:
        str: Whatever ``convert_language_code`` returns for the document's language code.
    """
    return convert_language_code(get_language_code(doc))

22 

23 

def make_nc_name(string):
    """
    Replaces every character listed in ``invalid_chars`` with an underscore.

    Args:
        string (str): The text to sanitise.

    Returns:
        str: A copy of ``string`` in which each listed punctuation character and
            each space has become ``_``. All other characters are left untouched.
    """
    invalid_chars = "!\"#$%&'()*+/:;<=>?@[\\]^,{|}~` "
    replacements = str.maketrans(dict.fromkeys(invalid_chars, "_"))

    # NOTE: no prefix is added when the result starts with a digit, "." or "-".
    return string.translate(replacements)

31 

32 

def extract_text(node:Element, include_tail:bool=True) -> str:
    """
    Recursively collects the text of an element into a single string.

    The element's own text and the text of each child are joined with single spaces.
    ``<pc>``, ``<witDetail>`` and ``<note>`` elements contribute nothing. An ``<app>``
    element is represented by its first ``<lem>`` descendant, or by its first
    ``<rdg>`` descendant if it has no ``<lem>``. Comments and processing instructions
    contribute only the text that follows them.

    Args:
        node (Element): The element to read. ``None`` gives an empty string.
        include_tail (bool): Whether to append the text that follows ``node`` (its
            ``tail``). Children are always read with their tails.

    Returns:
        str: The collected text with leading and trailing whitespace stripped.
    """
    if node is None:
        return ""

    # Comments and processing instructions carry a factory function, not a string,
    # as their ``tag``, so the regex below would raise TypeError on them. Their own
    # content is not document text, but the text following them is.
    if not isinstance(node.tag, str):
        return (node.tail or "").strip() if include_tail else ""

    tag = re.sub(r"{.*}", "", node.tag)

    # NOTE(review): the tail text of a skipped or <app> element is dropped together
    # with the element itself - confirm that this is intended.
    if tag in ["pc", "witDetail", "note"]:
        return ""
    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        return extract_text(lemma)

    pieces = [node.text or ""]
    pieces.extend(extract_text(child) for child in node)
    text = " ".join(pieces)

    if include_tail and node.tail:
        text += " " + node.tail

    return text.strip()

56 

57 

58 

59 

def read_tei(path:Path) -> ElementTree:
    """
    Parses the XML file at ``path`` into an element tree.

    The file is opened in binary mode so that the XML parser determines the
    encoding from the document itself instead of relying on the platform's default
    text encoding. Whitespace-only text nodes are discarded by the parser.

    Args:
        path (Path): Location of the XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    with open(path, 'rb') as f:
        return ET.parse(f, parser)

64 

65 

66def find_element(doc:ElementTree|Element, xpath:str) -> Element|None: 

67 assert doc is not None, f"Document is None in find_element({doc}, {xpath})" 

68 if isinstance(doc, ElementTree): 

69 doc = doc.getroot() 

70 namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"} 

71 element = doc.find(xpath, namespaces=namespaces) 

72 if element is None: 

73 try: 

74 element = doc.find(xpath) 

75 except SyntaxError: 

76 return None 

77 return element 

78 

79 

80def find_elements(doc:ElementTree|Element, xpath:str) -> Element|None: 

81 if isinstance(doc, ElementTree): 

82 doc = doc.getroot() 

83 namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"} 

84 results = doc.findall(xpath, namespaces=namespaces) 

85 results += doc.findall(xpath) 

86 return results 

87 

88 

89def find_parent(element:Element, tag:str) -> Element|None: 

90 """ 

91 Finds the nearest ancestor of the given element with the specified tag. 

92 

93 Args: 

94 element (Element): The starting XML element from which to search upward. 

95 tag (str): The tag name of the ancestor element to find. 

96 

97 Returns: 

98 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found. 

99 

100 Example: 

101 >>> from xml.etree.ElementTree import Element 

102 >>> root = Element('root') 

103 >>> ab = Element('ab') 

104 >>> section = Element('section') 

105 >>> target = Element('target') 

106 >>> root.append(ab) 

107 >>> ab.append(section) 

108 >>> section.append(target) 

109 >>> result = find_parent(target, 'ab') 

110 >>> assert result == ab 

111 

112 This will find the <ab> ancestor of the <target> element. 

113 """ 

114 while element is not None: 

115 element_tag = re.sub(r"{.*}", "", element.tag) 

116 if element_tag == tag: 

117 return element 

118 element = element.getparent() 

119 return None 

120 

121 

def write_tei(doc:ElementTree, path:Path|str) -> None:
    """
    Writes the document to ``path`` as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of ``path`` are created first.

    Args:
        doc (ElementTree): The document to serialise.
        path (Path|str): The destination file.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )

125 

126 

def get_reading_identifier(reading:Element, check:bool=False, create_if_necessary:bool=True) -> str:
    """
    Returns the identifier of a reading, optionally creating one.

    The ``xml:id`` attribute is preferred and the ``n`` attribute is used otherwise.

    Args:
        reading (Element): The reading element to identify.
        check (bool): If True, assert that the resulting identifier is not empty.
        create_if_necessary (bool): If True and the reading has neither attribute,
            the smallest positive integer not used as ``n`` by any ``rdg`` below the
            reading's parent is stored in the reading's ``n`` attribute.

    Returns:
        str: The identifier, or an empty string if there is none and none was created.
    """
    attributes = reading.attrib
    identifier = attributes.get("{http://www.w3.org/XML/1998/namespace}id", "") or attributes.get("n", "")

    if create_if_necessary and not identifier:
        app = reading.getparent()
        candidate = 1
        # Count upward until a number is free among the readings under the parent.
        while find_element(app, f".//rdg[@n='{candidate}']") is not None:
            candidate += 1
        identifier = str(candidate)
        attributes["n"] = identifier

    if check:
        assert identifier, f"Reading {reading} must have a name attribute 'xml:id' or 'n'."

    return identifier

142 

143 return identifier