Coverage for msstools/count.py: 100.00%
172 statements
« prev ^ index » next coverage.py v7.9.1, created at 2026-05-13 23:05 +0000
« prev ^ index » next coverage.py v7.9.1, created at 2026-05-13 23:05 +0000
1from collections import Counter, defaultdict
2from pathlib import Path
3import re
4import regex
5import numpy as np
6import matplotlib.pyplot as plt
7import svgwrite
def greek_char_count(string:str) -> int:
    """Return how many Greek characters occur in ``string``."""
    # Each regex match is exactly one Greek character, so counting matches
    # counts characters.
    greek_matches = regex.finditer(r'\p{IsGreek}', string)
    return sum(1 for _ in greek_matches)
def try_open_file( filenames ):
    """Open the first file in ``filenames`` that can be opened for reading.

    Args:
        filenames: Candidate paths, tried in the order given.

    Returns:
        A ``(file_object, basename)`` tuple for the first candidate that
        opens successfully, or ``(None, '')`` when none of them can be
        opened. The caller is responsible for closing the returned file.
    """
    for filename in filenames:
        try:
            # NOTE(review): no encoding is passed, so the platform default
            # applies -- confirm this matches the encoding of the text files.
            f = open(filename, 'r')
        except (OSError, ValueError):
            # OSError: missing file, directory, permissions, ...
            # ValueError: malformed path such as an embedded null byte.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return f, Path(filename).name
    return None, ''
def count_greek_chars(
    filename_prefix:str,
    start_homily:int=0,
    end_homily:int=32,
    warning_stdev:float = 1.8,
    output_path:Path|None = None,
    show:bool = False,
):
    """Count Greek characters per folio side and plot the distribution.

    For every homily index in ``start_homily..end_homily`` (inclusive) the
    file is looked up as ``<prefix><index>``, ``<prefix>0<index>`` and the
    same two names with a ``.txt`` suffix. Lines of the form ``|F 71bv|``
    switch the current page; Greek characters on every line are added to the
    count of the current page. Text seen before the first page marker is
    counted under the page label ``"Unk"``.

    Suspicious folio/side transitions, the mean and standard deviation of the
    per-page counts, and the outlier pages are printed to stdout. A scatter
    plot of the counts is then shown and/or saved.

    Args:
        filename_prefix: Path prefix shared by the homily files.
        start_homily: First homily index to read.
        end_homily: Last homily index to read (inclusive).
        warning_stdev: A page is reported as an outlier when its count is
            more than this many standard deviations away from the mean.
        output_path: Where to save the plot. Missing parent directories are
            created. When ``None`` the plot is shown instead.
        show: Show the plot even when ``output_path`` is given.

    Raises:
        AssertionError: If no Greek characters were counted on any page.
    """
    page_char_counts = Counter()
    # A defaultdict so that a page with no recorded file (the "Unk" page)
    # prints as an empty list in the outlier report instead of raising.
    page_to_file_dict = defaultdict(list)

    # Page change marker, e.g.:
    # |F 71bv|
    page_marker = regex.compile(r"\|F (\d+)([vrabcp]+)\|")

    current_page = "Unk"
    current_folio = None
    current_side = None
    for homily_index in range(start_homily, end_homily + 1):
        filename = f"{filename_prefix}{homily_index}"
        filename_leadingzero = f"{filename_prefix}0{homily_index}"

        f, current_filename = try_open_file( [ filename, filename_leadingzero, filename + ".txt", filename_leadingzero + ".txt"])
        if not f:
            print("Cannot open", filename)
            continue

        # Close each homily file as soon as it has been read.
        with f:
            for line in f:
                line = line.strip()

                match = page_marker.match(line)
                if match:
                    folio = match.group(1)
                    side = match.group(2)
                    page = folio + side

                    if current_folio:
                        # A new folio number should be the previous one plus one.
                        if folio != current_folio and int(folio) != int(current_folio) + 1:
                            print(f"Folio error from {current_page} to {page} in file {current_filename} ?")
                        # A new folio should not repeat the previous side, and
                        # should start on 'r' (sides marked 'p' are exempt).
                        if folio != current_folio and side == current_side and side != 'p':
                            print(f"Folio side error from {current_page} to {page} in file {current_filename} ?")
                        elif folio != current_folio and side != 'r' and side != 'p':
                            print(f"Folio side error from {current_page} to {page} in file {current_filename} ?")

                    current_page = page
                    current_folio = folio
                    current_side = side
                    page_to_file_dict[current_page] = current_filename

                # Only touch the counter for non-zero counts so that pages
                # without any Greek text do not appear with a count of 0.
                char_count = greek_char_count(line)
                if char_count:
                    page_char_counts[current_page] += char_count

    # Raised explicitly (rather than via ``assert``) so the check also runs
    # when Python is started with -O.
    if not page_char_counts:
        raise AssertionError(f'No pages found in files with prefix {filename_prefix}')

    fig, ax = plt.subplots(figsize=(20,10))

    labels, values = zip(*page_char_counts.items())
    mean = np.mean(values)
    std = np.std(values)
    print(f"Mean: {mean:.1f}")
    print(f"Standard Deviation: {std:.1f}")

    print("Outlier Pages:")
    outlier_indexes = []
    for index, (label, value) in enumerate(zip(labels, values)):
        if value > mean + warning_stdev*std or value < mean - warning_stdev*std:
            print(label, value, page_to_file_dict[label], sep='\t\t')
            outlier_indexes.append(index)

    indexes = np.arange(len(labels))
    ax.scatter(indexes, values, marker='o', edgecolor='red', facecolor='#00000000', linewidths=1)

    # Label the outliers directly on the plot.
    for index in outlier_indexes:
        ax.annotate(labels[index], (indexes[index], values[index]))

    ax.set_ylabel("Greek characters on folio side", horizontalalignment='right', y=1.0)
    ax.set_xlabel("Folio side", horizontalalignment='right', x=1.0)
    # Replace the default x ticks with every 20th page label, drawn vertically
    # just below the axis.
    ax.tick_params(axis="x", bottom=False, labelbottom=False)
    for index, label in enumerate(labels):
        if index % 20 == 0:
            ax.text(
                indexes[index],
                -0.02,
                label,
                ha="right",
                va="top",
                rotation=90,
                transform=ax.get_xaxis_transform(),
            )

    if show or output_path is None:
        plt.show()

    if output_path:
        output_path = Path(output_path)
        if not output_path.parent.exists():
            output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, bbox_inches='tight')
        print("Saved plot to:", output_path)

    plt.close(fig)
def read_sentence_counts( filename_prefix, start_homily = 0, end_homily = 32 ):
    """Read the Greek character count of every tagged sentence in the homily files.

    For every homily index in ``start_homily..end_homily`` (inclusive) the
    file is looked up as ``<prefix><index>``, ``<prefix>0<index>`` and the
    same two names with a ``.txt`` suffix. Files that cannot be opened are
    reported on stdout and skipped.

    Sentences are expected as ``<S n>...</S>`` elements nested inside
    ``<P n>...</P>`` elements.

    Args:
        filename_prefix: Path prefix shared by the homily files.
        start_homily: First homily index to read.
        end_homily: Last homily index to read (inclusive).

    Returns:
        A nested mapping ``counts[homily][paragraph][sentence]`` giving the
        number of Greek characters in that sentence (whitespace-stripped).
    """
    sentence_counts = defaultdict( lambda: defaultdict(lambda: defaultdict(int)))

    # Compiled once; DOTALL lets an element span several lines.
    paragraph_pattern = re.compile(r"\<P ([0-9]+)\>(.*?)\<\/P\>", re.MULTILINE|re.DOTALL)
    sentence_pattern = re.compile(r"\<S ([0-9]+)\>(.*?)\<\/S\>", re.MULTILINE|re.DOTALL)

    for homily_index in range(start_homily, end_homily+1):
        filename = f"{filename_prefix}{homily_index}"
        filename_leadingzero = f"{filename_prefix}0{homily_index}"

        f, _ = try_open_file( [ filename, filename_leadingzero, filename + ".txt", filename_leadingzero + ".txt"])
        if not f:
            print("Cannot open", filename)
            continue

        # Read the whole file and release the handle straight away.
        with f:
            data = f.read()

        for paragraph_number, paragraph_text in paragraph_pattern.findall(data):
            paragraph_number = int(paragraph_number)

            for sentence_number, sentence_text in sentence_pattern.findall(paragraph_text):
                sentence_number = int(sentence_number)
                sentence_counts[homily_index][paragraph_number][sentence_number] = greek_char_count(sentence_text.strip())

    return sentence_counts
def count_total( sentence_counts ) -> int:
    """Return the total number of sentence entries in a nested count mapping.

    ``sentence_counts`` is indexed as ``[homily][paragraph][sentence]``; every
    sentence key is counted once, whatever its stored value.
    """
    return sum(
        1
        for homily in sentence_counts
        for paragraph in sentence_counts[homily]
        for _sentence in sentence_counts[homily][paragraph]
    )
def compare_dictionaries(sentence_counts_base, sentence_counts_comparison, threshold):
    """Print the differences between a base and a comparison sentence count mapping.

    Both mappings are indexed as ``[homily][paragraph][sentence]``. Walking
    the base mapping, a message is printed for:

    * homilies, paragraphs and sentences absent from the comparison,
    * sentences whose comparison count is zero,
    * sentences whose comparison count exceeds the base count by more than
      ``threshold``.

    Base sentences with a count of zero are skipped once they are known to
    exist in the comparison.
    """
    for homily, base_paragraphs in sentence_counts_base.items():
        if homily not in sentence_counts_comparison:
            print(f"Homily {homily} not found in comparison text.")
            continue
        other_paragraphs = sentence_counts_comparison[homily]

        for paragraph, base_sentences in base_paragraphs.items():
            if paragraph not in other_paragraphs:
                print(f"Paragraph {homily}.{paragraph} not found in comparison text.")
                continue
            other_sentences = other_paragraphs[paragraph]

            for sentence, base_count in base_sentences.items():
                if sentence not in other_sentences:
                    print(f"Sentence {homily}.{paragraph}.{sentence} not found in comparison text.")
                    continue

                # Nothing to compare against when the base sentence is empty.
                if base_count == 0:
                    continue

                other_count = other_sentences[sentence]
                if other_count == 0:
                    print(f"Sentence {homily}.{paragraph}.{sentence} is empty in comparison text.")

                if other_count > base_count + threshold:
                    print(f"Sentence {homily}.{paragraph}.{sentence} above the threshold.")
def write_square( dwg, position, colour, size=1 , height=10):
    """Add one coloured cell to the drawing ``dwg``.

    Args:
        dwg: Drawing object providing ``rect(insert, size, **extra)`` and
            ``add(element)``.
        position: Zero-based index of the cell along the horizontal strip.
        colour: Fill colour of the cell.
        size: Width of one cell.
        height: Height of the cell.
    """
    # The second argument of ``rect`` is the (width, height) of the rectangle,
    # not its far corner, so every cell is exactly ``size`` wide.
    dwg.add(dwg.rect( (position*size, 0), ( size, height ), fill=colour ))
def svg_dictionaries( sentence_counts_base, sentence_counts_comparison, threshold, filename, size=1, height=10):
    """Write the comparison of two sentence counts to an SVG file.

    One cell is drawn per non-empty base sentence, left to right in iteration
    order, coloured as follows:

    * black -- the sentence is missing from the comparison,
    * red   -- the comparison sentence has a count of zero,
    * blue  -- the comparison count exceeds the base count by more than
      ``threshold``,
    * green -- otherwise.

    Args:
        sentence_counts_base: Base counts, indexed ``[homily][paragraph][sentence]``.
        sentence_counts_comparison: Comparison counts with the same layout.
        threshold: Allowed excess of the comparison count over the base count.
        filename: Path of the SVG file to write.
        size: Width of one cell.
        height: Height of the cells and of the drawing.
    """
    print("Writing SVG file:", filename)

    # NOTE(review): this total includes base sentences with a count of zero,
    # which are skipped below, so the canvas can be wider than the drawn
    # strip -- confirm whether that is intended.
    count = count_total(sentence_counts_base)
    dwg = svgwrite.Drawing(filename, size=(count*size,height), profile='tiny')

    position = 0
    for h in sentence_counts_base:
        for p in sentence_counts_base[h]:
            for s in sentence_counts_base[h][p]:
                # ignore if the base text is empty
                if sentence_counts_base[h][p][s] == 0:
                    continue

                if h not in sentence_counts_comparison or p not in sentence_counts_comparison[h] or s not in sentence_counts_comparison[h][p]:
                    colour = "black"
                elif sentence_counts_comparison[h][p][s] == 0:
                    colour = "red"
                elif sentence_counts_comparison[h][p][s] > sentence_counts_base[h][p][s] + threshold:
                    colour = "blue"
                else:
                    colour = "green"

                write_square( dwg, position, colour, size, height)
                # ``position`` is a cell index; write_square scales it by
                # ``size``, so it must advance by one cell, not by ``size``.
                position += 1

    dwg.save()
def compare_counts(base_prefix:str, comparison_prefix:str, output_svg:Path=None, start_homily:int=0, end_homily=32, threshold:int=50):
    """Compare the sentence counts of two sets of homily files.

    Reads the base and comparison files for homilies
    ``start_homily..end_homily``, prints the number of base sentences and the
    differences found between the two sets, and writes an SVG overview when
    ``output_svg`` is given.
    """
    print("Reading Base:")
    base_counts = read_sentence_counts(base_prefix, start_homily, end_homily)
    print('Base Sentence Count:', count_total(base_counts))

    print("Reading Comparison:")
    comparison_counts = read_sentence_counts(comparison_prefix, start_homily, end_homily)

    print("Checking:")
    compare_dictionaries(base_counts, comparison_counts, threshold)

    # The SVG overview is optional.
    if not output_svg:
        return
    svg_dictionaries(base_counts, comparison_counts, threshold, output_svg)