Coverage for vorlagellm/ensemble.py: 10.20%
49 statements
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2025-10-24 03:22 +0000
1from lxml import etree as ET
2from lxml.etree import _ElementTree as ElementTree
3from rich.progress import track
5from .tei import (
6 find_elements,
7 extract_text,
8 reading_has_witness,
9 add_witness_readings,
10 remove_witnesss_readings,
11 find_element,
12 add_responsibility_statement,
13)
def _require_same_length(groups: list, message: str) -> int:
    """Return the length shared by every item of *groups*, raising if any differs.

    Args:
        groups: Non-empty list of sized collections, one per input file.
        message: Start of the error message; the expected and found lengths
            are appended to it.

    Returns:
        The length of the first item, which all other items must match.

    Raises:
        AssertionError: If an item has a different length from the first one.
    """
    expected = len(groups[0])
    for group in groups[1:]:
        if len(group) != expected:
            # Raised explicitly (instead of using ``assert``) so the check is
            # still performed when Python runs with ``-O``.
            raise AssertionError(f"{message}, expected {expected} and found {len(group)}")
    return expected


def do_ensemble(xml_files:list[ElementTree], witness:str) -> ElementTree:
    """Combine several apparatus files into one by majority vote on *witness*.

    The first file is modified in place and returned.  For every ``<rdg>``
    position, the witness is counted as attesting the reading when at least
    half of the files attest it (``2 * votes >= len(xml_files)``, so a tie
    counts as attested), and the first file's reading is updated to match.
    The ``<respStmt>`` elements of the other files are appended to the first
    file's ``<titleStmt>``, and each ``<app>`` of the first file receives a new
    ``<witDetail>`` that collects the witness's ``<witDetail>`` found in the
    corresponding ``<app>`` of every file.

    NOTE(review): lxml's ``append`` re-parents an element that already belongs
    to a tree, so the other input files lose the ``<respStmt>`` and
    ``<witDetail>`` elements that are collected here -- confirm callers do not
    reuse those trees afterwards.

    Args:
        xml_files: At least two parsed apparatus documents with the same
            ``<app>``/``<rdg>`` structure and identical reading texts.
        witness: Identifier of the witness whose readings are ensembled.

    Returns:
        ``xml_files[0]`` after modification.

    Raises:
        AssertionError: If fewer than two files are given, if the files differ
            in their number of ``<app>`` elements or of ``<rdg>`` elements
            within an ``<app>``, or if aligned readings have different text.
    """
    if len(xml_files) < 2:
        raise AssertionError("Needs multiple apparatus objects to perform ensemble")

    ensemble_xml = xml_files[0]

    # Check the overall structure before anything is modified, so that a
    # mismatch does not leave the first file half-updated.
    apps_per_file = [find_elements(apparatus, ".//app") for apparatus in xml_files]
    apps_count = _require_same_length(
        apps_per_file,
        "Each apparatus must have the same number of <app> elements",
    )

    # Add responsibility statements from other files.
    # NOTE(review): find_element appears able to return None (the witDetail
    # lookup below guards for it); a first file without <titleStmt> would then
    # fail here with AttributeError -- confirm inputs always have one.
    title_stmt = find_element(ensemble_xml, ".//titleStmt")
    for other_xml in xml_files[1:]:
        for resp_stmt in find_elements(other_xml, ".//respStmt"):
            title_stmt.append(resp_stmt)

    # Record the ensembling step itself in the TEI header; its id is referenced
    # by the witDetail elements created below.
    _, responsibility_statement_id = add_responsibility_statement(
        ensemble_xml,
        "VorlageLLM-Ensemble",
        f"Ensembled from {len(xml_files)} files using VorlageLLM.",
    )

    for apps in track(zip(*apps_per_file), total=apps_count, description="Ensembling <app> elements"):
        readings_per_file = [find_elements(app, ".//rdg") for app in apps]
        _require_same_length(
            readings_per_file,
            "Each apparatus must have the same number of <rdg> elements in each <app>",
        )

        for aligned_readings in zip(*readings_per_file):
            # Readings at the same position must be the same reading in every file.
            expected_text = extract_text(aligned_readings[0])
            for other_reading in aligned_readings[1:]:
                found_text = extract_text(other_reading)
                if found_text != expected_text:
                    raise AssertionError(
                        "Each apparatus must have the same text in corresponding <rdg> elements, "
                        f"expected {expected_text!r} and found {found_text!r}"
                    )

            votes = [reading_has_witness(reading, witness) for reading in aligned_readings]
            majority_has_witness = 2 * sum(int(vote) for vote in votes) >= len(xml_files)

            # Only the first apparatus is modified, and only when it disagrees
            # with the majority.
            if majority_has_witness == votes[0]:
                continue
            if majority_has_witness:
                add_witness_readings(aligned_readings[0], witness)
            else:
                remove_witnesss_readings(aligned_readings[0], witness)

        # Look up the existing witDetail elements before creating the new one,
        # so the lookup in the first apparatus cannot return the new element.
        wit_details = [find_element(app, f".//witDetail[@wit='{witness}']") for app in apps]

        # Create the new witDetail in the first apparatus and gather the
        # per-file details underneath it.
        ensemble_wit_detail = ET.SubElement(apps[0], "witDetail", wit=witness, resp=responsibility_statement_id)
        for wit_detail in wit_details:
            if wit_detail is not None:
                ensemble_wit_detail.append(wit_detail)

    return ensemble_xml