Coverage for msstools/count.py: 100.00%

172 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2026-05-13 23:05 +0000

1from collections import Counter, defaultdict 

2from pathlib import Path 

3import re 

4import regex 

5import numpy as np 

6import matplotlib.pyplot as plt 

7import svgwrite 

8 

9 

def greek_char_count(string: str) -> int:
    """Return how many characters of ``string`` match ``\\p{IsGreek}``."""
    # One match per Greek character, so counting matches counts characters.
    return sum(1 for _ in regex.finditer(r'\p{IsGreek}', string))

14 

15 

def try_open_file(filenames):
    """Open the first file in ``filenames`` that can be opened for reading.

    Candidates are tried in order; any that fail with an ``OSError``
    (missing file, permission denied, is a directory, ...) are skipped.

    Args:
        filenames: Iterable of candidate paths to try, in priority order.

    Returns:
        A ``(file_object, base_name)`` tuple for the first candidate that
        opens, where ``base_name`` is the final path component of that
        candidate. Returns ``(None, '')`` when no candidate can be opened.

    The returned file object is open in text mode; the caller is
    responsible for closing it.
    """
    for filename in filenames:
        try:
            # NOTE: no explicit encoding is given, so the platform default
            # text encoding is used when the file is read.
            f = open(filename, 'r')
        except OSError:
            # Only I/O failures mean "try the next candidate"; anything else
            # (e.g. KeyboardInterrupt) should propagate.
            continue
        return f, Path(filename).name
    return None, ''

25 

26 

def count_greek_chars(
    filename_prefix: str,
    start_homily: int = 0,
    end_homily: int = 32,
    warning_stdev: float = 1.8,
    output_path: Path | None = None,
    show: bool = False,
):
    """Count Greek characters per folio side and plot the counts.

    For every homily index in ``start_homily..end_homily`` (inclusive) the
    function tries to open ``{prefix}{i}``, ``{prefix}0{i}`` and the same two
    names with a ``.txt`` suffix. Lines of the form ``|F 71bv|`` switch the
    current page; Greek characters on every line are added to the count of
    the current page (``"Unk"`` until the first marker is seen).

    The mean and standard deviation of the per-page counts are printed,
    pages further than ``warning_stdev`` standard deviations from the mean
    are printed and annotated on a scatter plot.

    Args:
        filename_prefix: Prefix shared by the homily files.
        start_homily: First homily index to read.
        end_homily: Last homily index to read (inclusive).
        warning_stdev: Number of standard deviations from the mean beyond
            which a page is reported as an outlier.
        output_path: If given, the figure is saved here; missing parent
            directories are created.
        show: Display the figure. It is also displayed whenever
            ``output_path`` is ``None``.

    Raises:
        AssertionError: If no Greek characters were counted on any page.
    """
    page_char_counts = Counter()
    # Page label -> name of the file in which that page marker was last seen.
    # Pages without a marker (e.g. "Unk") fall back to the default ``[]``.
    page_to_file_dict = defaultdict(list)
    page_marker = regex.compile(r"\|F (\d+)([vrabcp]+)\|")

    current_page = "Unk"
    current_folio = None
    current_side = None
    for homily_index in range(start_homily, end_homily + 1):
        filename = f"{filename_prefix}{homily_index}"
        filename_leadingzero = f"{filename_prefix}0{homily_index}"

        f, current_filename = try_open_file(
            [filename, filename_leadingzero, filename + ".txt", filename_leadingzero + ".txt"]
        )
        if f is None:
            print("Cannot open", filename)
            continue

        # Close the handle as soon as the file has been consumed.
        with f:
            for line in f:
                line = line.strip()

                # Check for change of page, e.g.:
                # |F 71bv|
                match = page_marker.match(line)
                if match:
                    folio = match.group(1)
                    side = match.group(2)
                    page = folio + side

                    # Sanity checks only apply when moving to a different folio.
                    # NOTE(review): the nesting of these checks was reconstructed
                    # from a flattened listing — confirm against the original file.
                    if current_folio and folio != current_folio:
                        if int(folio) != int(current_folio) + 1:
                            print(f"Folio error from {current_page} to {page} in file {current_filename} ?")
                        # A new folio should not repeat the previous side and is
                        # expected to start on 'r'; side 'p' is exempt from both.
                        if side != 'p' and (side == current_side or side != 'r'):
                            print(f"Folio side error from {current_page} to {page} in file {current_filename} ?")

                    current_page = page
                    current_folio = folio
                    current_side = side
                    page_to_file_dict[current_page] = current_filename

                char_count = greek_char_count(line)
                if char_count:
                    page_char_counts[current_page] += char_count

    # Raised explicitly (rather than via ``assert``) so the check survives -O.
    if not page_char_counts:
        raise AssertionError(f'No pages found in files with prefix {filename_prefix}')

    vals = list(page_char_counts.values())
    mean = np.mean(vals)
    std = np.std(vals)
    print(f"Mean: {mean:.1f}")
    print(f"Standard Deviation: {std:.1f}")

    upper_bound = mean + warning_stdev*std
    lower_bound = mean - warning_stdev*std

    print("Outlier Pages:")
    warning_labels = set()
    for item, item_count in page_char_counts.items():
        if item_count > upper_bound or item_count < lower_bound:
            print(item, item_count, page_to_file_dict[item], sep='\t\t')
            warning_labels.add(item)

    labels, values = zip(*page_char_counts.items())
    indexes = np.arange(len(labels))

    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        ax.scatter(indexes, values, marker='o', edgecolor='red', facecolor='#00000000', linewidths=1)

        # Label each outlier next to its point.
        for index, label in enumerate(labels):
            if label in warning_labels:
                ax.annotate(label, (indexes[index], values[index]))

        ax.set_ylabel("Greek characters on folio side", horizontalalignment='right', y=1.0)
        ax.set_xlabel("Folio side", horizontalalignment='right', x=1.0)

        # Replace the default x ticks with every 20th page label, drawn
        # just below the axis.
        ax.tick_params(axis="x", bottom=False, labelbottom=False)
        for index, label in enumerate(labels):
            if index % 20 == 0:
                ax.text(
                    indexes[index],
                    -0.02,
                    label,
                    ha="right",
                    va="top",
                    rotation=90,
                    transform=ax.get_xaxis_transform(),
                )

        if show or output_path is None:
            plt.show()

        if output_path:
            output_path = Path(output_path)
            if not output_path.parent.exists():
                output_path.parent.mkdir(parents=True, exist_ok=True)
            fig.savefig(output_path, bbox_inches='tight')
            print("Saved plot to:", output_path)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

138 

139 

140 

def read_sentence_counts( filename_prefix, start_homily = 0, end_homily = 32 ):
    """Read Greek character counts for every tagged sentence in the homily files.

    For every homily index in ``start_homily..end_homily`` (inclusive) the
    function tries to open ``{prefix}{i}``, ``{prefix}0{i}`` and the same two
    names with a ``.txt`` suffix. Within each file, ``<P n>...</P>`` blocks
    are paragraphs and ``<S n>...</S>`` blocks inside them are sentences.

    Args:
        filename_prefix: Prefix shared by the homily files.
        start_homily: First homily index to read.
        end_homily: Last homily index to read (inclusive).

    Returns:
        Nested ``defaultdict`` mapping
        ``homily index -> paragraph number -> sentence number`` to the number
        of Greek characters in that sentence. Homilies whose file cannot be
        opened are reported on stdout and omitted.
    """
    sentence_counts = defaultdict( lambda: defaultdict(lambda: defaultdict(int)))

    for homily_index in range(start_homily, end_homily+1):
        filename = f"{filename_prefix}{homily_index}"
        filename_leadingzero = f"{filename_prefix}0{homily_index}"

        f, _ = try_open_file( [ filename, filename_leadingzero, filename + ".txt", filename_leadingzero + ".txt"])
        if f is None:
            print("Cannot open", filename)
            continue

        # Read everything up front and close the handle straight away.
        with f:
            data = f.read()

        paragraphs = re.findall(r"\<P ([0-9]+)\>(.*?)\<\/P\>", data, re.MULTILINE|re.DOTALL)
        for paragraph_label, paragraph_text in paragraphs:
            paragraph_number = int(paragraph_label)

            sentences = re.findall(r"\<S ([0-9]+)\>(.*?)\<\/S\>", paragraph_text, re.MULTILINE|re.DOTALL)
            for sentence_label, sentence_text in sentences:
                sentence_number = int(sentence_label)
                sentence_counts[homily_index][paragraph_number][sentence_number] = greek_char_count(sentence_text.strip())

    return sentence_counts

167 

168 

def count_total( sentence_counts ) -> int:
    """Return the number of sentence entries across all homilies and paragraphs.

    ``sentence_counts`` is a three-level mapping
    (homily -> paragraph -> sentence -> value); only the number of innermost
    keys is counted, the stored values are ignored.
    """
    return sum(
        len(sentences)
        for paragraphs in sentence_counts.values()
        for sentences in paragraphs.values()
    )

176 

177 

def compare_dictionaries(sentence_counts_base, sentence_counts_comparison, threshold):
    """Print a report of how the comparison counts differ from the base counts.

    Both arguments are three-level mappings
    (homily -> paragraph -> sentence -> character count). For every sentence
    in the base, a message is printed when the homily, paragraph or sentence
    is missing from the comparison, when the comparison count is zero, or
    when the comparison count exceeds the base count by more than
    ``threshold``. Sentences whose base count is zero are not compared.
    """
    for homily, base_paragraphs in sentence_counts_base.items():
        if homily not in sentence_counts_comparison:
            print(f"Homily {homily} not found in comparison text.")
            continue
        other_paragraphs = sentence_counts_comparison[homily]

        for paragraph, base_sentences in base_paragraphs.items():
            if paragraph not in other_paragraphs:
                print(f"Paragraph {homily}.{paragraph} not found in comparison text.")
                continue
            other_sentences = other_paragraphs[paragraph]

            for sentence, base_count in base_sentences.items():
                if sentence not in other_sentences:
                    print(f"Sentence {homily}.{paragraph}.{sentence} not found in comparison text.")
                    continue

                # ignore if the base text is empty
                if base_count == 0:
                    continue

                other_count = other_sentences[sentence]
                if other_count == 0:
                    print(f"Sentence {homily}.{paragraph}.{sentence} is empty in comparison text.")

                if other_count > base_count + threshold:
                    print(f"Sentence {homily}.{paragraph}.{sentence} above the threshold.")

199 

200 

def write_square( dwg, position, colour, size=1 , height=10):
    """Add one filled rectangle to ``dwg`` at slot ``position``.

    Args:
        dwg: Drawing object providing ``rect(insert, size, **extra)`` and
            ``add(element)``.
        position: Zero-based slot index; the rectangle's left edge is at
            ``position * size``.
        colour: Fill colour of the rectangle.
        size: Width of one slot.
        height: Height of the rectangle.
    """
    # The second argument of ``rect`` is (width, height), not the far corner,
    # so every square is exactly ``size`` wide regardless of its position.
    dwg.add(dwg.rect( (position*size, 0), (size, height), fill=colour ))

203 

204 

def svg_dictionaries( sentence_counts_base, sentence_counts_comparison, threshold, filename, size=1, height=10):
    """Write the comparison of two sentence counts to an SVG file.

    One square is drawn per base sentence with a non-zero count, coloured:

    * ``black`` - the sentence is missing from the comparison,
    * ``red``   - the comparison count is zero,
    * ``blue``  - the comparison count exceeds the base count by more than
      ``threshold``,
    * ``green`` - otherwise.

    Args:
        sentence_counts_base: Mapping homily -> paragraph -> sentence -> count.
        sentence_counts_comparison: Mapping of the same shape to compare with.
        threshold: Allowed excess of the comparison count over the base count.
        filename: Path of the SVG file to write.
        size: Width of each square.
        height: Height of the drawing and of each square.
    """
    print("Writing SVG file:", filename)

    # The canvas width is based on every base sentence, including empty ones
    # that are skipped below, so it can be wider than the drawn strip.
    count = count_total(sentence_counts_base)
    dwg = svgwrite.Drawing(filename, size=(count*size,height), profile='tiny')

    position = 0
    for h in sentence_counts_base:
        for p in sentence_counts_base[h]:
            for s in sentence_counts_base[h][p]:
                base_count = sentence_counts_base[h][p][s]
                # ignore if the base text is empty
                if base_count == 0:
                    continue

                # Membership is tested level by level (short-circuiting) so
                # that missing keys are never looked up in the comparison.
                if h not in sentence_counts_comparison or p not in sentence_counts_comparison[h] or s not in sentence_counts_comparison[h][p]:
                    colour = "black"
                elif sentence_counts_comparison[h][p][s] == 0:
                    colour = "red"
                elif sentence_counts_comparison[h][p][s] > base_count + threshold:
                    colour = "blue"
                else:
                    colour = "green"

                write_square( dwg, position, colour, size, height)
                # ``position`` is a slot index; write_square scales it by
                # ``size``, so it advances by one slot per square.
                position += 1

    dwg.save()

233 

234 

def compare_counts(base_prefix: str, comparison_prefix: str, output_svg: Path = None, start_homily: int = 0, end_homily=32, threshold: int = 50):
    """Compare per-sentence Greek character counts of two sets of homily files.

    Reads the base and comparison texts for homilies
    ``start_homily..end_homily``, prints the number of base sentences and a
    textual report of the differences, and, when ``output_svg`` is given,
    also writes the comparison to that SVG file.
    """
    print("Reading Base:")
    base_counts = read_sentence_counts(base_prefix, start_homily, end_homily)
    print('Base Sentence Count:', count_total(base_counts))

    print("Reading Comparison:")
    comparison_counts = read_sentence_counts(comparison_prefix, start_homily, end_homily)

    print("Checking:")
    compare_dictionaries(base_counts, comparison_counts, threshold)

    # The SVG rendering is optional.
    if not output_svg:
        return
    svg_dictionaries(base_counts, comparison_counts, threshold, output_svg)