Coverage for vorlagellm/tei.py: 87.59%

1from pathlib import Path

2from lxml import etree as ET

3from lxml.etree import _ElementTree as ElementTree

4from lxml.etree import _Element as Element

5from lxml.etree import Element as new_element

6from lxml.etree import ElementTree as new_element_tree

7import re

8from dataclasses import dataclass

9from datetime import datetime

10import copy

12from .languages import convert_language_code

15@dataclass

16class Permutation:

17 text:str

18 readings:list[Element]

19 apps:list[Element]=None

def read_tei(path:Path) -> ElementTree:
    """
    Parses an XML file into an element tree.

    The parser is created with ``remove_blank_text=True``, so blank text nodes
    between tags are not kept in the resulting tree.

    Args:
        path (Path): Location of the XML file to read.

    Returns:
        ElementTree: The parsed document.
    """
    parser = ET.XMLParser(remove_blank_text=True)
    # Hand the parser raw bytes so it decodes the file using the encoding
    # declared in the document rather than the platform's default text encoding.
    with open(path, 'rb') as f:
        return ET.parse(f, parser)

28def find_element(doc:ElementTree|Element, xpath:str) -> Element|None:

29 if isinstance(doc, ElementTree):

30 doc = doc.getroot()

31 element = doc.find(xpath, namespaces=doc.nsmap)

32 if element is None:

33 element = doc.find(xpath)

34 return element

37def find_elements(doc:ElementTree|Element, xpath:str) -> Element|None:

38 if isinstance(doc, ElementTree):

39 doc = doc.getroot()

40 return doc.findall(xpath, namespaces=doc.nsmap)

def get_siglum(doc:ElementTree|Element) -> str:
    """
    Gets the value of the 'n' attribute of the <title type="document"> element.

    Returns an empty string if the element is not found or has no 'n' attribute.
    """
    document_title = find_element(doc, ".//title[@type='document']")
    return "" if document_title is None else document_title.attrib.get('n', "")

def get_language_code(doc:ElementTree|Element) -> str:
    """
    Reads the <text> element and returns the value of its xml:lang attribute.

    Returns an empty string if there is no <text> element or it has no xml:lang attribute.
    """
    text_element = find_element(doc, ".//text")
    if text_element is not None:
        return text_element.attrib.get("{http://www.w3.org/XML/1998/namespace}lang", "")

    return ""

def get_language(doc:ElementTree|Element) -> str:
    """ Returns the document's xml:lang code after passing it through convert_language_code."""
    return convert_language_code(get_language_code(doc))

def app_has_witness(app:Element, siglum:str) -> bool:
    """ Returns True if any <rdg> element below the apparatus entry lists the specified siglum."""
    for rdg in find_elements(app, ".//rdg"):
        if reading_has_witness(rdg, siglum):
            return True
    return False

def get_verses(doc:ElementTree|Element) -> list[str]:
    """ Returns the "n" attribute of every <ab> element that has one, in document order."""
    verses = []
    for ab in find_elements(doc, ".//ab"):
        verse = ab.attrib.get('n')
        if verse is not None:
            verses.append(verse)
    return verses

def find_readings(element, ignore_types:list[str]|None) -> list[Element]:
    """
    Collects the <rdg> elements below an element, optionally dropping some by type.

    Args:
        element: The element (or tree) to search for readings.
        ignore_types (list[str]|None): Values of the "type" attribute to exclude. A reading
            without a "type" attribute is treated as having the type "". Nothing is excluded
            when this is None or empty.

    Returns:
        list[Element]: The readings that remain.
    """
    candidates = find_elements(element, ".//rdg")
    if not ignore_types:
        return candidates

    kept = []
    for rdg in candidates:
        if rdg.attrib.get("type", "") in ignore_types:
            continue
        kept.append(rdg)
    return kept

def get_reading_permutations(
    apparatus:ElementTree|Element,
    verse:str,
    witness:str="",
    bracket_app:Element|None=None,
    ignore_types:list[str]|None=None,
    max_permutations:int=0,
) -> list[Permutation]:
    """
    Builds the combinations of readings for a verse, choosing one reading per apparatus entry.

    The direct children of the verse element are walked in order. Text from
    non-<app> children is appended to every permutation; each <app> child
    multiplies the permutations by its readings.

    Args:
        apparatus (ElementTree|Element): The document containing the verse.
        verse (str): The "n" attribute of the <ab> element to process.
        witness (str): If given, an <app> that contains this witness only contributes the
            readings that list it. Apps without the witness contribute all of their readings.
        bracket_app (Element|None): If given, readings of this <app> are wrapped in ⸂ ⸃.
        ignore_types (list[str]|None): Reading types to skip (see ``find_readings``).
        max_permutations (int): Upper bound on the number of permutations returned.
            Zero or a negative value means no limit.

    Returns:
        list[Permutation]: The permutations with whitespace-normalised text and ``apps`` set
        to the verse's <app> children. Empty if the verse is not found, or if an <app>
        contributes no readings at all.
    """
    verse_element = get_verse_element(apparatus, verse)
    if verse_element is None:
        return []

    permutations = [Permutation(text="", readings=[])]
    apps = []
    for child in verse_element:
        # Comments and processing instructions do not have a string tag.
        if not isinstance(child.tag, str):
            continue

        tag = re.sub(r"\{.*\}", "", child.tag)
        if tag != "app":
            # The child's text is the same for every permutation, so extract it once.
            child_text = extract_text(child)
            for permutation in permutations:
                permutation.text = (permutation.text + " " + child_text).strip()
            continue

        apps.append(child)
        has_witness = bool(witness) and app_has_witness(child, witness)
        is_bracketed = bracket_app is not None and bracket_app == child

        new_permutations = []
        for reading in find_readings(child, ignore_types=ignore_types):
            if has_witness and not reading_has_witness(reading, witness):
                continue

            reading_text = extract_text(reading) or ""
            if is_bracketed:
                reading_text = f"⸂{reading_text}⸃"

            for permutation in permutations:
                new_permutations.append(
                    Permutation(
                        text=permutation.text + " " + reading_text,
                        readings=permutation.readings + [reading],
                    )
                )
        permutations = new_permutations

    # Always return the cleaned permutations (normalised text, apps attached),
    # not only when the list has to be shortened.
    results = [
        Permutation(
            text=re.sub(r"\s+", " ", permutation.text.strip()),
            readings=permutation.readings,
            apps=apps,
        )
        for permutation in permutations
    ]

    if max_permutations > 0 and len(results) > max_permutations:
        # Take an evenly spaced sample; the integer step can still yield a few
        # extra items, so cut the sample down to the requested maximum.
        step = len(results) // max_permutations
        results = results[::step][:max_permutations]

    return results

155

156

157

158# def extract_text(node:Element, include_tail:bool=True) -> str:

159# text = node.text or ""

160# for child in node:

161# if isinstance(child.tag, str):

162# tag = re.sub(r"{.*}", "", child.tag)

163# else:

164# continue

165

166# if tag in ["pc", "witDetail", "note"]:

167# continue

168# if tag == "app":

169# breakpoint()

170# lemma = find_element(child, ".//lem")

171# if lemma is None:

172# lemma = find_element(child, ".//rdg")

173# text += extract_text(lemma) or ""

174# text += " "

175# elif tag == "ref":

176# root = child.getroottree().getroot()

177# target_id = child.attrib['target'].lstrip("#")

178# ns = {"tei": "http://www.tei-c.org/ns/1.0"}

179# target = root.xpath(f"//*[@xml:id='{target_id}']", namespaces=ns)

180

181# breakpoint()

182# child_text = extract_text(target[0]) if target else extract_text(child)

183# text += child_text or ""

184# else:

185# text += extract_text(child) or ""

186

187# if tag == "w":

188# text += " "

189

190# if include_tail:

191# text += node.tail or ""

192

193# return text

194

195

def get_verse_element(doc:ElementTree|Element, verse:str) -> Element|None:
    """ Returns the first <ab> element whose "n" attribute equals the verse, or None if there is none."""
    verse_xpath = f".//ab[@n='{verse}']"
    return find_element(doc, verse_xpath)

198

199

def get_verse_text(doc:ElementTree|Element, verse:str) -> str|None:
    """
    Returns the extracted text of a verse.

    Args:
        doc (ElementTree|Element): The document containing the verse.
        verse (str): The "n" attribute of the <ab> element to read.

    Returns:
        str|None: The stripped result of ``extract_text`` for the verse element, or None if the verse is not found.
    """
    ab = get_verse_element(doc, verse)
    return None if ab is None else extract_text(ab).strip()

206

207

def add_witness_readings(readings:Element|list[Element], siglum:str) -> None:
    """
    Adds a witness to the "wit" attribute of one or more readings.

    The siglum is written with a leading "#". Readings that already list the
    witness (according to ``reading_has_witness``) are left unchanged.

    Args:
        readings (Element|list[Element]): A single reading or a list of readings to update.
        siglum (str): The siglum of the witness, with or without a leading "#".
    """
    if isinstance(readings, Element):
        readings = [readings]

    # Work out the pointer form once. The membership check below keeps using
    # the siglum exactly as the caller passed it, so every reading is tested
    # the same way regardless of its position in the list.
    pointer = siglum if siglum.startswith("#") else "#" + siglum

    for reading in readings:
        if reading_has_witness(reading, siglum):
            continue

        existing = reading.attrib.get('wit', "")
        reading.attrib['wit'] = f"{existing} {pointer}".strip()

223

224

def remove_witnesss_readings(readings:Element|list[Element], siglum:str) -> None:
    """
    Removes a witness from the "wit" attribute of one or more readings.

    Both the siglum as given and the siglum prefixed with "#" are removed.
    Readings that do not list the witness are left unchanged.

    Args:
        readings (Element|list[Element]): A single reading or a list of readings to update.
        siglum (str): The siglum of the witness to remove.
    """
    targets = [readings] if isinstance(readings, Element) else readings
    unwanted = (siglum, f"#{siglum}")

    for rdg in targets:
        if reading_has_witness(rdg, siglum):
            remaining = [entry for entry in rdg.attrib['wit'].split() if entry not in unwanted]
            rdg.attrib['wit'] = " ".join(remaining)

236

237

def write_tei(doc:ElementTree, path:Path|str) -> None:
    """
    Writes the document to a file as pretty-printed UTF-8 XML with an XML declaration.

    Missing parent directories of the target path are created first.

    Args:
        doc (ElementTree): The document to write.
        path (Path|str): The destination file.
    """
    target_dir = Path(path).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    doc.write(
        str(path),
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )

241

242

def get_witness_list(apparatus:ElementTree|Element) -> Element:
    """
    Returns the first <listWit> element of the apparatus.

    Raises:
        ValueError: If the apparatus contains no <listWit> element.
    """
    witness_list = find_element(apparatus, ".//listWit")
    if witness_list is not None:
        return witness_list

    raise ValueError("Could not find <listWit> element in the apparatus.")

249

250

def add_siglum(apparatus:ElementTree|Element, siglum:str) -> Element:
    """
    Adds a <witness> element to the <listWit> element in the apparatus.

    If a <witness> with the same "n" attribute already exists, it is returned
    and nothing is added.

    Args:
        apparatus (ElementTree|Element): The apparatus document or its root element.
        siglum (str): The value for the "n" attribute of the witness.

    Returns:
        Element: The existing or newly created <witness> element.

    Raises:
        ValueError: If the apparatus contains no <listWit> element.
    """
    if isinstance(apparatus, ElementTree):
        apparatus = apparatus.getroot()

    list_wit = get_witness_list(apparatus)

    # Check if the witness already exists. The comparison must be against None:
    # an lxml element without children evaluates as false, so a truthiness test
    # would treat an existing empty <witness/> as missing and add a duplicate.
    witness_element = find_element(list_wit, f".//witness[@n='{siglum}']")
    if witness_element is None:
        witness_element = ET.Element("witness", attrib={"n": siglum})
        list_wit.append(witness_element)

    return witness_element

265

266

def has_witness(apparatus:ElementTree|Element, siglum:str) -> bool:
    """
    Returns True if the apparatus's <listWit> contains a <witness> whose "n" attribute equals the siglum.

    Raises:
        ValueError: If the apparatus contains no <listWit> element.
    """
    witness_xpath = f".//witness[@n='{siglum}']"
    match = find_element(get_witness_list(apparatus), witness_xpath)
    return match is not None

270

271

def reading_has_witness(reading:Element, siglum:str) -> bool:
    """
    Returns True if the reading's "wit" attribute lists the witness.

    The attribute is split on whitespace, and the witness counts as present when
    either the siglum as given or the siglum prefixed with "#" is among the entries.
    A reading without a "wit" attribute never has the witness.
    """
    wit = reading.attrib.get('wit')
    if wit is None:
        return False

    listed = set(wit.split())
    return not listed.isdisjoint({siglum, f"#{siglum}"})

278

279

280def add_wit_detail(apps:Element|set[Element], siglum:str, note:str="", phrase:str="", phrase_lang:str="", resp_id:str="VorlageLLM") -> None:

281 if isinstance(apps, Element):

282 apps = [apps]

283 for app in apps:

284 wit_detail = ET.SubElement(app, "witDetail", wit=siglum, resp=f"#{resp_id}")

285 if phrase:

286 phrase_element = ET.SubElement(wit_detail, "phr")

287 phrase_element.text = phrase

288 if phrase_lang:

289 phrase_element.attrib['{http://www.w3.org/XML/1998/namespace}lang'] = phrase_lang

290 if note:

291 ET.SubElement(wit_detail, "note").text = note

292

293

294def find_parent(element:Element, tag:str) -> Element|None:

295 """

296 Finds the nearest ancestor of the given element with the specified tag.

297

298 Args:

299 element (Element): The starting XML element from which to search upward.

300 tag (str): The tag name of the ancestor element to find.

301

302 Returns:

303 Optional[Element]: The nearest ancestor element with the specified tag, or None if no such element is found.

304

305 Example:

306 >>> from xml.etree.ElementTree import Element

307 >>> root = Element('root')

308 >>> ab = Element('ab')

309 >>> section = Element('section')

310 >>> target = Element('target')

311 >>> root.append(ab)

312 >>> ab.append(section)

313 >>> section.append(target)

314 >>> result = find_parent(target, 'ab')

315 >>> assert result == ab

316

317 This will find the <ab> ancestor of the <target> element.

318 """

319 while element is not None:

320 element_tag = re.sub(r"{.*}", "", element.tag)

321 if element_tag == tag:

322 return element

323 element = element.getparent()

324 return None

325

326

def strip_namespace(element: Element) -> Element:
    """
    Remove the namespace from the tag of an element and of all its descendants.

    The element is modified in place. Only tag names are changed; attribute
    names are not touched.

    Args:
        element (Element): The root of the subtree to process.

    Returns:
        Element: The same element that was passed in.
    """
    # iter() yields the element itself followed by all of its descendants.
    for elem in element.iter():
        # Comments and processing instructions have a non-string tag.
        if isinstance(elem.tag, str):
            elem.tag = elem.tag.split('}', 1)[-1]  # Remove namespace
    return element

334

335

def write_elements(elements:list[Element], output_file:Path, root_tag:str="body", **kwargs) -> None:
    """
    Writes a list of XML elements to a file, wrapped in a newly created root element.

    Each element is passed through ``strip_namespace`` and appended to the new
    root, so the elements handed in are modified by this call. None entries
    are skipped.

    Args:
        elements (list[Element]): The XML elements to write.
        output_file (Path): The file the XML will be written to.
        root_tag (str): Tag name of the wrapping root element. Defaults to "body".
        **kwargs: Passed on when creating the root element.

    Returns:
        None: The XML structure is written to the specified file.
    """
    wrapper = new_element(root_tag, **kwargs)

    present = (item for item in elements if item is not None)
    for item in present:
        stripped = strip_namespace(item)
        wrapper.append(stripped)

    write_tei(new_element_tree(wrapper), output_file)

355

356

def get_apparatus_verse_text(app:Element, witness:str="") -> str:
    """
    Returns the text of the verse containing an apparatus entry, with that entry's text wrapped in ⸂ ⸃.

    The verse is the nearest enclosing <ab> element. Its direct children are
    rendered in order; <pc>, <witDetail> and <note> children and non-element
    nodes are skipped. For each <app> child one reading is rendered: the first
    reading that lists the witness (if a witness is given and present), otherwise
    the <lem>, otherwise the first <rdg>, otherwise the <app> itself.

    Args:
        app (Element): The apparatus entry to highlight.
        witness (str): Optional siglum whose reading is preferred in every apparatus entry.

    Returns:
        str: The verse text with whitespace collapsed, or an empty string if the
        entry has no enclosing <ab> element.
    """
    parent = find_parent(app, 'ab')
    if parent is None:
        return ""

    # Pieces are joined with single spaces and whitespace is collapsed at the
    # end, so empty pieces do not affect the result.
    parts = [(parent.text or "").strip()]
    for child in parent:
        # Comments and processing instructions have a non-string tag.
        if not isinstance(child.tag, str):
            continue

        tag = re.sub(r"{.*}", "", child.tag)
        if tag in ["pc", "witDetail", "note"]:
            continue

        if tag != "app":
            parts.append((extract_text(child) or "").strip())
            continue

        lemma = None
        if witness:
            lemma = next(
                (reading for reading in find_elements(child, ".//rdg") if reading_has_witness(reading, witness)),
                None,
            )
        if lemma is None:
            lemma = find_element(child, ".//lem")
        if lemma is None:
            lemma = find_element(child, ".//rdg")
        if lemma is None:
            # Fall back to the entry currently being rendered, not the highlighted one.
            lemma = child

        app_text = (extract_text(lemma, include_tail=False) or "").strip()
        if child == app:
            app_text = f"⸂{app_text}⸃"
        parts.append(app_text)

        # Text following this entry belongs to this entry's own tail.
        if child.tail:
            parts.append(child.tail.strip())

    parts.append(parent.tail or "")
    return re.sub(r"\s+", " ", " ".join(parts).strip())

400

401

def extract_text(node:Element, include_tail:bool=True, strip:bool=True) -> str:
    """
    Recursively extracts the readable text of an element.

    Rules applied per element (tags are compared without namespace):
      - None, comments and processing instructions yield "".
      - <pc>, <witDetail> and <note> yield "".
      - <app> yields the text of its <lem>, or of its first <rdg> if there is no <lem>.
      - <ref> yields the text of the element whose xml:id matches its "target"
        attribute (leading "#" removed), if such an element exists.
      - Otherwise the element's own text, the text of its children and (optionally)
        its tail are concatenated and runs of whitespace are collapsed to one space.
      - <w>, and <lb> unless its "break" attribute is "no", get a trailing space.

    Args:
        node (Element): The element to extract text from. May be None.
        include_tail (bool): Whether to include the text following the element.
        strip (bool): Whether to strip leading and trailing whitespace from the result.

    Returns:
        str: The extracted text.
    """
    if node is None:
        return ""

    # Comments and processing instructions have a non-string tag.
    if not isinstance(node.tag, str):
        return ""
    tag = re.sub(r"{.*}", "", node.tag)

    if tag in ["pc", "witDetail", "note"]:
        return ""

    if tag == "app":
        lemma = find_element(node, ".//lem")
        if lemma is None:
            lemma = find_element(node, ".//rdg")
        # Compare against None explicitly: an lxml element without child
        # elements evaluates as false, so a truthiness test would ignore a
        # text-only reading and concatenate every reading of the entry instead.
        if lemma is not None:
            return extract_text(lemma, strip=False) or ""

    if tag == "ref":
        target_id = node.attrib.get('target', "").lstrip("#")
        if target_id:
            root = node.getroottree().getroot()
            # Pass the ID as an XPath variable so quotes in it cannot break the expression.
            target = root.xpath("//*[@xml:id=$target_id]", target_id=target_id)
            if target:
                # NOTE(review): a reference whose target contains the reference
                # itself would recurse without end - confirm this cannot occur.
                return extract_text(target[0], strip=strip)

    text = node.text or ""
    for child in node:
        text += extract_text(child, strip=False)

    if include_tail and node.tail:
        text += node.tail

    text = re.sub(r"\s+", " ", text)

    # Words and (breaking) line beginnings separate the surrounding text.
    if tag == "w" or (tag == "lb" and node.attrib.get("break", "").lower() != "no"):
        text += " "

    if strip:
        text = text.strip()

    return text

444

445

def readings_for_witness(app:Element, siglum:str) -> set[Element]:
    """
    Gathers the <rdg> elements of an apparatus entry that list a given witness.

    Args:
        app (Element): The XML element representing the apparatus entry.
        siglum (str): The siglum of the witness to look for.

    Returns:
        set[Element]: The reading elements whose "wit" attribute includes the witness.
    """
    return {rdg for rdg in find_elements(app, ".//rdg") if reading_has_witness(rdg, siglum)}

459

460

def add_doc_metadata(witness_element:Element, doc:ElementTree) -> Element:
    """
    Copies the document's file description into the witness element.

    The first `biblFull` element below `witness_element` is used as the target;
    if there is none, a new `biblFull` child is created. Deep copies of all
    children of the first `fileDesc` element in `doc` are then appended to it.
    Nothing is copied when `doc` has no `fileDesc` element.

    Args:
        witness_element (Element): The XML element representing the witness that receives the metadata.
        doc (ElementTree): The XML element tree containing the source metadata.

    Returns:
        Element: The `biblFull` element within the `witness_element`.
    """
    target = find_element(witness_element, ".//biblFull")
    if target is None:
        target = ET.SubElement(witness_element, "biblFull")

    source = find_element(doc, ".//fileDesc")
    if source is None:
        return target

    for item in source:
        # Copy so the source document keeps its own metadata elements.
        target.append(copy.deepcopy(item))
    return target

485

486

def add_responsibility_statement(doc:ElementTree, xml_id:str, description:str) -> tuple[Element,str]:
    """
    Adds a responsibility statement to the XML document.

    A <respStmt> with a single <resp> child is appended to the document's
    <titleStmt>. The <teiHeader>, <fileDesc> and <titleStmt> elements are
    created if they are missing. The <resp> carries the current local time in
    its "when" attribute and the description as its text.

    Args:
        doc (ElementTree): The XML document to which the responsibility statement will be added.
        xml_id (str): The requested xml:id for the statement. If a <respStmt> with this
            ID already exists in the <titleStmt>, a numeric suffix ("-2", "-3", ...) is
            appended until the ID is unused.
        description (str): The description of the responsibility statement.

    Returns:
        Element: The responsibility statement element that was added to the document.
        str: The unique ID of the responsibility statement.
    """
    if isinstance(doc, ElementTree):
        doc = doc.getroot()

    header = find_element(doc, ".//teiHeader")
    if header is None:
        header = ET.SubElement(doc, "teiHeader")

    file_description = find_element(header, ".//fileDesc")
    if file_description is None:
        file_description = ET.SubElement(header, "fileDesc")

    title_statement = find_element(file_description, ".//titleStmt")
    if title_statement is None:
        title_statement = ET.SubElement(file_description, "titleStmt")

    # Get unique ID. Suffixes are added to the requested ID so that the
    # information it carries is kept when a collision occurs.
    base_id = xml_id
    counter = 1
    while find_element(title_statement, f".//respStmt[@{{http://www.w3.org/XML/1998/namespace}}id='{xml_id}']") is not None:
        counter += 1
        xml_id = f"{base_id}-{counter}"

    responsibility_statement = ET.SubElement(title_statement, "respStmt")
    responsibility_statement.attrib['{http://www.w3.org/XML/1998/namespace}id'] = xml_id

    # Get datetime in required format
    formatted_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    resp = ET.SubElement(responsibility_statement, "resp", when=formatted_time)
    resp.text = description

    return responsibility_statement, xml_id

532

533

def add_responsibility_statement_llm(doc:ElementTree, siglum:str, model_id:str) -> tuple[Element,str]:
    """
    Adds a responsibility statement recording that a witness was added with an LLM.

    The requested ID and the description are both built from the siglum and
    the model ID and then handed to ``add_responsibility_statement``.

    Args:
        doc (ElementTree): The XML document to which the responsibility statement will be added.
        siglum (str): The siglum of the witness.
        model_id (str): The ID of the LLM model used.

    Returns:
        Element: The responsibility statement element that was added to the document.
        str: The unique ID of the responsibility statement.
    """
    return add_responsibility_statement(
        doc,
        f"VorlageLLM-{siglum}-{model_id}",
        f"Witness '{siglum}' added using VorlageLLM using LLM '{model_id}'",
    )