Coverage for rdgai/tei.py: 100.00%

1import re

2from pathlib import Path

3from lxml import etree as ET

4from lxml.etree import _ElementTree as ElementTree

5from lxml.etree import _Element as Element

7from .languages import convert_language_code

def get_language_code(doc:ElementTree|Element) -> str:
    """
    Looks up the <text> element and returns the value of its ``xml:lang`` attribute.

    Args:
        doc (ElementTree|Element): The document or element to search in.

    Returns:
        str: The ``xml:lang`` value of the <text> element found by ``find_element``,
            or an empty string if no <text> element exists or it carries no ``xml:lang``.
    """
    text_element = find_element(doc, ".//text")
    if text_element is not None:
        return text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")
    return ""

def get_language(doc:ElementTree|Element) -> str:
    """
    Reads the ``xml:lang`` code of the document and passes it through ``convert_language_code``.

    Args:
        doc (ElementTree|Element): The document or element to read the language code from.

    Returns:
        str: Whatever ``convert_language_code`` returns for the code
            (presumably a human-readable language name; see ``.languages``).
    """
    return convert_language_code(get_language_code(doc))

def make_nc_name(string):
    """
    Replaces every character from a fixed set of punctuation (and the space) with an underscore.

    Args:
        string (str): The text to sanitise.

    Returns:
        str: ``string`` with each listed character replaced by ``_``; all other characters are kept.

    NOTE: the first character is not treated specially, so a result may still begin with
    a digit, ``.`` or ``-``.
    """
    invalid_chars = "!\"#$%&'()*+/:;<=>?@[\\]^,{|}~` "
    replacements = str.maketrans(dict.fromkeys(invalid_chars, "_"))
    return string.translate(replacements)

def extract_text(node:Element, include_tail:bool=True) -> str:
    """
    Recursively collects the text of an element and its descendants.

    Rules applied while walking the tree (namespaces on tags are ignored):
      * ``None`` yields an empty string.
      * Comments, processing instructions and other nodes whose ``tag`` is not a string
        contribute no content of their own; only their tail text is kept.
      * <pc>, <witDetail> and <note> elements yield an empty string (their tail is dropped too).
      * For <app>, the text of the first <lem> is used, or of the first <rdg> if there is no <lem>.
      * Otherwise the element's text, the text of each child and (optionally) the tail
        are joined with single spaces.

    Args:
        node (Element): The element to extract text from. May be ``None``.
        include_tail (bool): Whether to append the tail text of ``node`` itself.
            Children are always extracted with their tails.

    Returns:
        str: The collected text with leading and trailing whitespace stripped.
    """
    if node is None:
        return ""

    # Comment / processing-instruction nodes expose a callable instead of a string as `tag`,
    # which would make the regular expression below raise a TypeError. They carry no document
    # text themselves, but the text following them (their tail) belongs to the parent.
    if not isinstance(node.tag, str):
        tail = node.tail if include_tail else None
        return (tail or "").strip()

    # Drop a leading "{namespace}" so that comparisons use the local tag name only.
    tag = re.sub(r"{.*}", "", node.tag)

    if tag in ["pc", "witDetail", "note"]:
        return ""
    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        return extract_text(lemma) or ""

    text = node.text or ""
    for child in node:
        text += " " + extract_text(child)

    if include_tail and node.tail:
        text += " " + node.tail

    return text.strip()

def read_tei(path:Path) -> ElementTree:
    """
    Parses the XML file at ``path`` and returns the resulting document tree.

    The parser is created with ``remove_blank_text=True``.

    Args:
        path (Path): Location of the XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    # Open in binary mode so the XML parser receives the raw bytes and determines the
    # encoding itself (XML declaration / BOM). Text mode would decode with the platform's
    # default encoding first, which can fail on or corrupt non-ASCII content.
    with open(path, 'rb') as f:
        return ET.parse(f, parser)

66def find_element(doc:ElementTree|Element, xpath:str) -> Element|None:

67 assert doc is not None, f"Document is None in find_element({doc}, {xpath})"

68 if isinstance(doc, ElementTree):

69 doc = doc.getroot()

70 namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}

71 element = doc.find(xpath, namespaces=namespaces)

72 if element is None:

73 try:

74 element = doc.find(xpath)

75 except SyntaxError:

76 return None

77 return element

80def find_elements(doc:ElementTree|Element, xpath:str) -> Element|None:

81 if isinstance(doc, ElementTree):

82 doc = doc.getroot()

83 namespaces = doc.nsmap | {"xml": "http://www.w3.org/XML/1998/namespace"}

84 results = doc.findall(xpath, namespaces=namespaces)

85 results += doc.findall(xpath)

86 return results

89def find_parent(element:Element, tag:str) -> Element|None:

90 """

91 Finds the nearest ancestor of the given element with the specified tag.

93 Args:

94 element (Element): The starting XML element from which to search upward.

95 tag (str): The tag name of the ancestor element to find.

97 Returns:

98 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found.

100 Example:

101 >>> from xml.etree.ElementTree import Element

102 >>> root = Element('root')

103 >>> ab = Element('ab')

104 >>> section = Element('section')

105 >>> target = Element('target')

106 >>> root.append(ab)

107 >>> ab.append(section)

108 >>> section.append(target)

109 >>> result = find_parent(target, 'ab')

110 >>> assert result == ab

111

112 This will find the <ab> ancestor of the <target> element.

113 """

114 while element is not None:

115 element_tag = re.sub(r"{.*}", "", element.tag)

116 if element_tag == tag:

117 return element

118 element = element.getparent()

119 return None

120

121

122def write_tei(doc:ElementTree, path:Path|str) -> None:

123 Path(path).parent.mkdir(parents=True, exist_ok=True)

124 doc.write(str(path), encoding="utf-8", xml_declaration=True, pretty_print=True)

125

126

def get_reading_identifier(reading:Element, check:bool=False, create_if_necessary:bool=True) -> str:
    """
    Returns the identifier of a reading element, optionally creating one.

    The ``xml:id`` attribute is preferred; if it is missing or empty, the ``n`` attribute is used.
    If neither yields a value and ``create_if_necessary`` is set, the smallest positive integer
    that is not yet used as ``n`` by any <rdg> below the reading's parent is stored in the
    reading's ``n`` attribute and returned.

    Args:
        reading (Element): The reading element.
        check (bool): If set, assert that the resulting identifier is not empty.
        create_if_necessary (bool): If set, generate and store an ``n`` value when none exists.

    Returns:
        str: The identifier, or an empty string if none exists and none was created.
    """
    identifier = (
        reading.attrib.get("{http://www.w3.org/XML/1998/namespace}id", "")
        or reading.attrib.get("n", "")
    )

    if create_if_necessary and not identifier:
        app = reading.getparent()
        # Count upwards until a number is found that no <rdg> under the parent uses as @n.
        number = 1
        while find_element(app, f".//rdg[@n='{number}']") is not None:
            number += 1
        identifier = str(number)
        reading.attrib["n"] = identifier

    if check:
        assert identifier, f"Reading {reading} must have a name attribute 'xml:id' or 'n'."

    return identifier