Coverage for vorlagellm/ensemble.py: 10.20%

49 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-10-24 03:22 +0000

1from lxml import etree as ET 

2from lxml.etree import _ElementTree as ElementTree 

3from rich.progress import track 

4 

5from .tei import ( 

6 find_elements, 

7 extract_text, 

8 reading_has_witness, 

9 add_witness_readings, 

10 remove_witnesss_readings, 

11 find_element, 

12 add_responsibility_statement, 

13) 

14 

def _require_equal_counts(element_lists: list, description: str) -> None:
    """Raise ``AssertionError`` unless every list in *element_lists* has the same length.

    The length of the first list is the expected value; *description* names the
    elements being counted and is interpolated into the error message.

    An explicit ``raise`` is used rather than an ``assert`` statement so the check
    still runs under ``python -O``. ``AssertionError`` is kept so callers observe
    the same exception type as before.
    """
    expected = None
    for elements in element_lists:
        if expected is None:
            expected = len(elements)
        elif expected != len(elements):
            raise AssertionError(
                f"Each apparatus must have the same number of {description}, "
                f"expected {expected} and found {len(elements)}"
            )


def _apply_majority_vote(readings: tuple, witness: str) -> None:
    """Make the first reading in *readings* agree with the majority about *witness*.

    *readings* holds the corresponding ``<rdg>`` element from each apparatus.
    Only ``readings[0]`` is modified.

    Raises:
        AssertionError: if the readings do not all have the same extracted text.
    """
    reference_text = None
    readings_with_witness = 0
    for reading in readings:
        text = extract_text(reading)
        if reference_text is None:
            reference_text = text
        elif reference_text != text:
            raise AssertionError(
                "Corresponding <rdg> elements must have the same text, "
                f"expected {reference_text!r} and found {text!r}"
            )
        readings_with_witness += int(reading_has_witness(reading, witness))

    # ``>=`` means an exact tie counts as the witness attesting the reading.
    majority_has_witness = 2 * readings_with_witness >= len(readings)

    first_reading = readings[0]
    if majority_has_witness == reading_has_witness(first_reading, witness):
        return  # the first apparatus already agrees with the majority
    if majority_has_witness:
        add_witness_readings(first_reading, witness)
    else:
        remove_witnesss_readings(first_reading, witness)


def do_ensemble(xml_files: list[ElementTree], witness: str) -> ElementTree:
    """Combine several apparatus files into one by majority vote for *witness*.

    The first tree in *xml_files* is modified in place and returned:

    * every ``<respStmt>`` found in the other files is appended to its
      ``<titleStmt>``, and a "VorlageLLM-Ensemble" responsibility statement is
      added;
    * for each ``<rdg>``, *witness* is added or removed so that the first file
      agrees with the majority of files (ties count as attesting);
    * for each ``<app>``, a new ``<witDetail>`` is created in the first file and
      the ``<witDetail>`` found for *witness* in each file is appended to it.

    Args:
        xml_files: At least two parsed apparatus trees with the same ``<app>``
            and ``<rdg>`` structure.
        witness: Identifier of the witness being ensembled.

    Returns:
        ``xml_files[0]`` after modification.

    Raises:
        AssertionError: if fewer than two files are given, if the files differ
            in their number of ``<app>`` or ``<rdg>`` elements, or if
            corresponding readings differ in text.
    """
    if len(xml_files) < 2:
        raise AssertionError("Needs multiple apparatus objects to perform ensemble")

    apparatus_apps_list = [find_elements(apparatus, ".//app") for apparatus in xml_files]

    # Add responsibility statements from the other files to the first file.
    collation_xml_file = xml_files[0]
    title_stmt = find_element(collation_xml_file, ".//titleStmt")
    for xml_file in xml_files[1:]:
        for resp_stmt in find_elements(xml_file, ".//respStmt"):
            title_stmt.append(resp_stmt)

    # Record the ensemble itself in the TEI header; its id is referenced by
    # every witDetail created below.
    _, responsibility_statement_id = add_responsibility_statement(
        collation_xml_file,
        "VorlageLLM-Ensemble",
        f"Ensembled from {len(xml_files)} files using VorlageLLM.",
    )

    # zip() below silently truncates to the shortest input, so mismatched
    # apparatus files must be rejected explicitly.
    _require_equal_counts(apparatus_apps_list, "<app> elements")
    apps_count = len(apparatus_apps_list[0])

    wit_detail_xpath = f".//witDetail[@wit='{witness}']"

    for app_in_each_file in track(
        zip(*apparatus_apps_list),
        total=apps_count,
        description="Ensembling <app> elements",
    ):
        readings_list = [find_elements(app, ".//rdg") for app in app_in_each_file]
        _require_equal_counts(readings_list, "<rdg> elements in each <app>")

        for readings in zip(*readings_list):
            _apply_majority_vote(readings, witness)

        # Look up the existing witDetail in every apparatus *before* creating
        # the new one, so the new element is never matched by the search.
        wit_details = [find_element(app, wit_detail_xpath) for app in app_in_each_file]

        # Create the ensemble witDetail in the first apparatus and gather the
        # per-file witDetail elements underneath it.
        ensemble_app = app_in_each_file[0]
        ensemble_wit_detail = ET.SubElement(
            ensemble_app,
            "witDetail",
            wit=witness,
            resp=responsibility_statement_id,
        )
        for wit_detail in wit_details:
            if wit_detail is not None:
                ensemble_wit_detail.append(wit_detail)

    return collation_xml_file

90