generated from LPBeaulieu/Typewriter-OCR-TintypeText
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathe-braille-tales.py
2412 lines (2286 loc) · 170 KB
/
e-braille-tales.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import cv2
import os
import shutil
import re
from alive_progress import alive_bar
import numpy as np
from fastai.vision.all import *
#Clear the command line screen. The shell command is "cls" on Windows
#and "clear" on every other platform.
os.system('cls' if os.name == 'nt' else 'clear')
#Current working folder, used as the root of the folder paths built below.
cwd = os.getcwd()
#Should a text file name be provided for a modified braille text file
#(found within the "OCR Raw Data" subfolder of the current working folder,
#ex: python3 e-braille-tales.py "my_file.txt") the OCR code will be
#skipped altogether and only the writing of the Portable Embosser Format
#(PEF) file and transcription to printed English (RTF document) will be performed.
#If no other argument is provided, the code will carry on with the OCR step
#outlined in the "if len(sys.argv) == 1" statement below.
if len(sys.argv) == 1:
#The functions below extract the x and y coordinates of every character in a JPEG image of
#scanned braille text written using a Perkins Brailler, at 300 dpi resolution and with
#the smallest possible left margin on a 8 1/2" by 11" page typed in landscape mode.
#Importantly, the page must be scanned with the left margin placed in such a way that the
#shadows produced by the scanner light will face away from the left margin (the shadows will
#face the right margin of the page, when the page is viewed in landscape mode). This is
#because the non-white pixels actually result from the presence of shadows, the orientation
#of which plays a major role in image segmentation (determining the x and y coordinates
#of the individual characters) and optical character recognition (OCR). For best results,
#the braille document should be typed on white braille paper or cardstock and scanned as
#grayscale images on a flatbed scanner at a 300 dpi resolution with the paper size setting
#of the scanner set to letter (8 1/2" x 11"). The darkness settings of the scanner might
#also need to be adjusted to achieve an optimal braille shadow to noise ratio.
def _find_braille_lines(dark_pixel_counts, cutoff, imgheight):
    """Return the [y_min, y_max] bounds of the braille lines detected at "cutoff".

    "dark_pixel_counts" holds, for every y coordinate (in landscape mode), the
    number of non-white pixels found along the x axis. Only the y coordinates
    whose count exceeds "cutoff" are considered. The module-level
    "character_height" must be defined before this function is called.
    """
    y_pixels = np.where(dark_pixel_counts > cutoff)[0]
    lines_y_min_maxes = []
    #A y coordinate closes a line only when the next retained y coordinate lies
    #more than 15 pixels further (dots within a braille cell are within 15 pixels
    #of one another). The last retained y coordinate has no successor and is
    #therefore never registered as a line.
    for current_y, next_y in zip(y_pixels[:-1], y_pixels[1:]):
        if next_y - current_y <= 15:
            continue
        #All braille cells have the same height, so the line's y minimum is
        #found by subtracting "character_height" from the current y pixel.
        y_min = current_y - character_height
        #Lines overlapping with the upper or lower borders of the page are excluded,
        #as the scanned image might have darkened edges.
        touches_page_border = (y_min - character_height <= 0
                               or current_y + character_height >= imgheight)
        #Lines whose y minimum is less than "character_height" away from the y
        #minimum of the previously registered line overlap with it and are excluded.
        overlaps_previous_line = (len(lines_y_min_maxes) > 0
                                  and y_min - lines_y_min_maxes[-1][0] < character_height)
        if not (touches_page_border or overlaps_previous_line):
            lines_y_min_maxes.append([y_min, current_y])
    return lines_y_min_maxes


def get_character_x_y_coordinates(image):
    """Return the coordinates of every braille cell of a scanned page.

    "image" is the grayscale page image (a numpy array). The returned list holds
    one "[[x_min, y_min], [x_max, y_max]]" entry per cell, 41 cells per detected line.

    The function relies on the following module-level names, which must be set
    before it is called: "character_width", "character_height", "text_image_copy"
    (the image on which the green rectangles are drawn), "JPEG_file_names", "i"
    (index of the current page in "JPEG_file_names") and "cwd". As a side effect,
    the image with the overlaid rectangles is saved in the
    "Page image files with rectangles" folder.

    Raises a ValueError if every cutoff value yields more than 19 lines.
    """
    #Since the braille pages are typed in landscape mode the "imgheight" is actually
    #the width of the page (shape[1]) as it would be laid out for reading.
    imgheight = image.shape[1]
    #White pixels (255) are converted to 0 and non-white pixels to 1. Summing along
    #axis 0 (the page was written in landscape mode, but scanned in portrait mode)
    #gives the number of non-white pixels for every y coordinate. This sum does not
    #depend on the cutoff, so it is computed only once.
    image_filtered = np.where(image == 255, 0, 1)
    dark_pixel_counts = np.sum(image_filtered, axis=0)
    #As the non-white pixels result from the shadows of the embossed braille
    #characters, the segmentation results vary greatly from scanner to scanner.
    #Every cutoff between 30 and 290 (in steps of 10) is therefore tried. A maximum
    #of 19 lines can be written in landscape mode on a 8 1/2" by 11" sheet of paper
    #on a Perkins Brailler, so only the cutoffs yielding at most 19 lines are retained.
    cutoff_results = []
    for cutoff in range(30, 300, 10):
        number_of_lines = len(_find_braille_lines(dark_pixel_counts, cutoff, imgheight))
        if number_of_lines <= 19:
            cutoff_results.append((number_of_lines, cutoff))
    if not cutoff_results:
        raise ValueError("Every cutoff value yielded more than 19 braille lines for "
                         + str(JPEG_file_names[i]) + ", the page could not be segmented.")
    #The same maximal amount of lines could be observed at several cutoff values.
    #The lowest of these cutoffs is selected, as it is based on more y coordinates
    #to start with, effectively increasing the resolution of the segmentation.
    max_number_of_lines = max(number_of_lines for number_of_lines, _ in cutoff_results)
    cutoff = min(candidate for number_of_lines, candidate in cutoff_results
                 if number_of_lines == max_number_of_lines)
    lines_y_min_maxes = _find_braille_lines(dark_pixel_counts, cutoff, imgheight)
    #The first braille cell of each line begins at x pixel 282 (in landscape mode,
    #in an image scanned at 300 dpi in portrait mode, with the left margin facing up
    #and set at its minimal setting on the Perkins Brailler). Each of the 41 cells
    #of a line is "character_width" wide and 12 pixels apart from the next one.
    characters_x_min_maxes = []
    for column in range(41):
        x_min = 282 + column*(character_width + 12)
        characters_x_min_maxes.append([x_min, x_min + character_width])
    chars_x_y_coordinates = []
    #The lines are visited in reverse order so that the characters are cropped line
    #by line starting from the top of the page. The border and overlap checks were
    #already enforced when the lines were detected, so every line is kept here.
    for y_min, y_max in reversed(lines_y_min_maxes):
        for x_min, x_max in characters_x_min_maxes:
            chars_x_y_coordinates.append([[x_min, y_min], [x_max, y_max]])
            #The rectangles are drawn in green to check whether the coordinates of the
            #characters line up well. The y coordinates are given before the x
            #coordinates, as the page was written in landscape mode but scanned
            #in portrait mode.
            cv2.rectangle(text_image_copy, (y_min, x_min), (y_max, x_max), (0,255,0), 3)
    #If the "Page image files with rectangles" folder doesn't already
    #exist in the working folder, it will be created.
    os.makedirs(cwd + "/Page image files with rectangles/", exist_ok=True)
    cv2.imwrite(cwd + "/Page image files with rectangles/"
                + JPEG_file_names[i][:-4] + ' with character rectangles.jpg', text_image_copy)
    return chars_x_y_coordinates
#The list "JPEG_file_names" is populated with the sorted ".jpg" file names
#found in the "OCR Raw Data" folder.
JPEG_file_names = [file_name for file_name in sorted(os.listdir(cwd + "/OCR Raw Data/"))
                   if file_name.endswith(".jpg")]
#Without any scanned page there is nothing to process and no name to
#derive the output folder from.
if not JPEG_file_names:
    sys.exit('No ".jpg" files were found in the "OCR Raw Data" folder.')
#The output folder name is extracted from the first image name, including all
#characters up to the last hyphen (e.g. "Alice's Adventures in Wonderland Chapter 1-0001.jpg"
#would give the following extracted name: "Alice's Adventures in Wonderland Chapter 1").
#Should the name not contain any hyphen, the name without its extension is used.
last_hyphen_index = JPEG_file_names[0].rfind("-")
if last_hyphen_index != -1:
    OCR_text_file_name = JPEG_file_names[0][:last_hyphen_index]
else:
    OCR_text_file_name = os.path.splitext(JPEG_file_names[0])[0]
path = cwd + "/OCR Predictions/" + OCR_text_file_name + "/"
os.makedirs(path, exist_ok=True)
print("Currently processing a total of " + str(len(JPEG_file_names)) +
      ' JPEG scanned images of braille text written \non the Perkins Brailler. ' +
      'For best results, these should be scanned as grayscale \nJPEG images on a ' +
      'flatbed scanner at a resolution of 300 dpi.\n')
#Import the convolutional neural network (cnn) deep learning model for OCR prediction.
#My optimal model trained on 58 braille pages typed on 8 1/2" x 11" pages in landscape mode
#on a Perkins Brailler. The model was trained using a batch size of 64, a learning rate of 0.005
#and 3 epochs of training, yielding a validation accuracy of 99.9777% (about one error per 4,500 characters!).
learn = load_learner(cwd + '/Model_Perkins_Brailler_acc9997')
#CHARACTER WIDTH AND SPACING PARAMETERS
#The "character_width" and "character_height" parameters were based on the pixel
#counts for the braille cells in JPEG images scanned at a resolution of 300 dpi.
#They are read by the function "get_character_x_y_coordinates".
character_width = 60
character_height = 90
#"character_string" accumulates the braille text of every page.
character_string = ""
with alive_bar(len(JPEG_file_names)) as bar, \
        open(path + OCR_text_file_name + '-OCR results.txt', 'a+', encoding="utf-8") as f:
    #The page index must be named "i", as "get_character_x_y_coordinates" reads it
    #to name the image with the overlaid character rectangles.
    for i, JPEG_file_name in enumerate(JPEG_file_names):
        #Insert two new lines ("\n\n") at the beginning of every page after the
        #first page. This way, every page in the ".txt" file will be separated by
        #an empty line, to facilitate making corrections if needed. Furthermore,
        #an empty braille cell is added to the end of "character_string" to make
        #sure there is a space in between the last word of a page and the first
        #word of the next page.
        if i > 0:
            f.write("\n\n")
            character_string += "⠀"
        text_image = cv2.imread(cwd + "/OCR Raw Data/" + str(JPEG_file_name))
        #"text_image_copy" is the image on which "get_character_x_y_coordinates"
        #draws the green character rectangles.
        text_image_copy = text_image.copy()
        #Convert the image (loaded by OpenCV in BGR order) to grayscale.
        text_image_gray = cv2.cvtColor(text_image, cv2.COLOR_BGR2GRAY)
        #Extract the braille character coordinates of the page.
        chars_x_y_coordinates = get_character_x_y_coordinates(text_image_gray)
        #Every character is cropped (with a 10 pixel margin on every side)
        #and saved as an individual ".jpg" image.
        char_files = []
        for j, ((x_min, y_min), (x_max, y_max)) in enumerate(chars_x_y_coordinates):
            char_file = path + str(j) + ".jpg"
            cv2.imwrite(char_file, text_image_gray[x_min-10:x_max+10, y_min-10:y_max+10])
            char_files.append(char_file)
        if char_files:
            #Submit the batch of individual character images to the model for prediction.
            dl = learn.dls.test_dl(char_files, shuffle=False)
            preds = learn.get_preds(dl=dl)[0].softmax(dim=1)
            #Determine the category index of the most probable label of every character.
            preds_argmax = preds.argmax(dim=1).tolist()
            #Convert the category index of each character to its label.
            #If you want to print out the dictionary mapping the labels to the label
            #indices, uncomment the following line:
            # print(learn.dls.vocab.o2i)
            character_list = [learn.dls.vocab[category_index] for category_index in preds_argmax]
        else:
            #No braille line was detected on this page, so there is nothing to predict.
            character_list = []
        #Delete the individual character ".jpg" images used for OCR (you can comment out
        #the following lines of code should you want to retain them for troubleshooting purposes).
        for char_file in char_files:
            os.remove(char_file)
        #Substitute an actual empty braille cell "⠀" for the label that was written
        #in long form for compatibility reasons ("empty_braille_cell").
        character_list = ["⠀" if label == "empty_braille_cell" else label
                          for label in character_list]
        #As the character cropping continued until the end of every line whether or not
        #it still contained characters, there could be a series of superfluous empty
        #braille cells at the end of a line. All but one of them are removed from lines
        #that end with more than one successive empty braille cell. The lines are
        #processed in reverse order so as not to raise indexing issues as cells are
        #removed: the iteration "line_number" handles the line ending right before index
        #"line_number*41", and the last iteration (0) handles the last line of the page,
        #whose end is located from the current length of the list. Runs of 41 empty
        #braille cells are skipped over here and removed after joining the list.
        for line_number in range(len(character_list)//41 - 1, -1, -1):
            if line_number > 0:
                current_index = line_number*41 - 2
            else:
                current_index = len(character_list) - 2
            window_start = max(current_index - 40, 0)
            if (character_list[window_start:current_index+1] != 41*["⠀"]
                    and character_list[current_index] == "⠀"
                    and character_list[current_index+1] == "⠀"):
                #The bound check keeps the index from wrapping around to the end of the list.
                while current_index >= 0 and character_list[current_index] == "⠀":
                    character_list.pop(current_index)
                    current_index -= 1
        #The elements of "character_list" are joined with empty strings, as the empty
        #braille cells already act as spaces, and the instances of 41 successive
        #empty braille cells (empty lines) are removed.
        current_page_string = "".join(character_list).replace(41*"⠀", "")
        #Any instances of at least two successive full braille cells
        #(denoting typos) are then removed.
        current_page_string = re.sub("⠿(⠿+)", "", current_page_string)
        #The line continuations with a space braille symbols ("⠐⠐") are changed for a space.
        #This needs to be done after the removal of the superfluous spaces (code directly
        #above), otherwise the spaces wouldn't be retained. As the code already stitches any
        #given line to the contents of the preceding line, line continuations without spaces
        #("⠐") shouldn't be used in the current application, as these would lead to confusion
        #with other braille characters, such as initial-letter contractions.
        current_page_string = current_page_string.replace("⠐⠐", "⠀")
        #The page is added to "character_string" and written to the ".txt" file.
        character_string += current_page_string
        f.write(current_page_string)
        bar()
#Should a text file name be provided for a modified braille text file
#(found within the "OCR Raw Data" subfolder of the current working folder,
#ex: python3 e-braille-tales.py "my_file.txt") the OCR code will be
#skipped altogether and only the writing of the Portable Embosser Format
#(PEF) file and transcription to printed English (RTF document) will be performed.
else:
file_name = sys.argv[1]
#Extracting folder name from file name, up to the last hyphen. Should the
#file name not contain any hyphen, the name without its extension is used.
#If there isn't already a subfolder by that name in the "OCR Predictions"
#folder, such a one will be created.
last_hyphen_index = file_name.rfind("-")
if last_hyphen_index != -1:
    OCR_text_file_name = file_name[:last_hyphen_index]
else:
    OCR_text_file_name = os.path.splitext(file_name)[0]
path = cwd + "/OCR Predictions/" + OCR_text_file_name + "/"
os.makedirs(path, exist_ok=True)
#The modified text file present in the "OCR Raw Data" subfolder
#is opened and its text (after removal of the "\n\n" carriage
#returns that were included to facilitate reviewing the braille text
#page by page) is stored as a string in "character_string". The file
#holds Unicode braille characters, so it is explicitly read as UTF-8.
with open(cwd + "/OCR Raw Data/" + file_name, "r", encoding="utf-8") as g:
    character_string = g.read().replace("\n", "")
#The Portable Embosser Format (PEF) file needs to be generated before removing
#braille characters such as "dot locator for mention" or "transcriber-defined
#typeform indicators", as these symbols could be relevant to the braille reader.
#The number of columns and rows per page will be defined below and could be changed
#by the users based on the specifications of their braille embosser/e-reader.
#The section break RTF command that starts at a new page ("\sbkpage"), which maps to the
#following braille characters ("⠸⠡⠎⠃⠅⠏⠁⠛⠑"), will be changed for the appropriate
#PEF section tags (<section> </section>). A similar approach will be taken for
#page breaks "\page" (mapping to the braille "⠸⠡⠏⠁⠛⠑"), and carriage returns
#"\line" (corresponding to the braille "⠸⠡⠇⠔⠑")
columns_per_page = 40
lines_per_page = 25
#As the braille characters for the section, page and line breaks will be converted
#to the PEF tags within the braille string, a copy of "character_string is made".
pef_file_string = character_string
#Upon assembling the PEF file, opening and closing volume, section, page and row
#tags will be included, with the "pef_file_string" sandwitched in between.
#Consequently, upon finding a RTF command for a section, page or line break,
#only the closing tag needs to be included along with the opening tag for the
#next section, page or line (and the appropriate number of new line ("\n")
#and tab ("\t") formatting elements.) The braille equivalent of the RTF command
#"\tab" ("⠸⠡⠞⠁⠃") will be changed to two successive empty braille cells ("⠀⠀"),
#while "\par" ("⠸⠡⠏⠜") will be mapped to a line break followed by two successive
#empty braille cells ("</row>\n\t\t\t\t\t<row>⠀⠀"). An empty braille cell is
#included at the end of every pattern in the lines below to avoid being left over
#with excessive spaces, as PEF tags don't allow for optional spaces the way RTF
#tags do, and since the braille will not be submitted to transcription code that
#requires determining whether the braille characters that follow the patterns
#are free standing or not.
pef_file_string = re.sub("⠸⠡⠞⠁⠃⠀", "⠀⠀", pef_file_string)
pef_file_string = re.sub("⠸⠡⠇⠔⠑⠀", "</row>\n\t\t\t\t\t<row>", pef_file_string)
pef_file_string = re.sub("⠸⠡⠏⠜⠀", "</row>\n\t\t\t\t\t<row>⠀⠀", pef_file_string)
pef_file_string = re.sub("⠸⠡⠏⠁⠛⠑⠀", "</row>\n\t\t\t\t</page>\n\t\t\t\t<page>\n\t\t\t\t\t<row>", pef_file_string)
pef_file_string = re.sub("⠸⠡⠎⠃⠅⠏⠁⠛⠑⠀", "</row>\n\t\t\t\t</page>\n\t\t\t</section>\n\t\t\t<section>\n\t\t\t\t</page>\n\t\t\t\t\t<row>", pef_file_string)
#The "pef_rows" list will be populated with the braille characters,
#up to a maximal length of "columns_per_page". The "current_row_length"
#counter (initialized at 0 and reinitialized every time there is a
#section, row or page break) will keep track of the length of the pef
#row and determine when it is time to start a new row.
pef_rows = []
current_row_length = 0
#The list "non_empty_braille_cells" is assembled by splitting
#"pef_file_string" at every empty braille cell ("⠀"). The
#elements of this list will be appended to the list "pef_rows"
#while "current_row_length" is below "columns_per_page" + 1
#(+1 being added to accomodate an empty braille cell after
#adding the "non-non_empty_braille_cells" element "non-empty".)
#If the "non-empty" element is comprised only of braille
#characters (and does not contain an "r" present in all of
#the "re.sub" substitutions above for section, page and line
#breaks, as each one of these contains a <row> tag having
#an "r" in it) the length of the "non_empty" element will be
#simply determined by "len(non_empty)" for computational
#efficiency and the "current_row_length" will be incremented
#by that amount. For "non-empty" elements containing an "r",
#the length will be determined by determining the length
#of the match corresponding to a pattern excluding the
#the following: "\n\t<>sectionrowpage/\\\\" in order to
#only account for the length of braille characters (if any)
#in the "non-empty" element.
non_empty_braille_cells = re.split("⠀", pef_file_string)
for non_empty in non_empty_braille_cells:
if "r" not in non_empty:
length_non_empty = len(non_empty)
elif "r" in non_empty:
pattern = re.compile("[^ \n\t<>sectionrowpage/\\\\]")
match_non_empty = re.match(pattern, non_empty)
length_non_empty = len(match.group(0))
current_row_length = length_non_empty
#If the current line can accomodate the "non_empty" element
#in addition to an empty braille cell, then the current
#"pef_row" is extended with these characters and the
#"current_row_length" is incremented by the length
#of "non_empty" plus one for the empty braille cell.
if current_row_length + length_non_empty < columns_per_page:
pef_rows.extend(non_empty + "⠀")
current_row_length += length_non_empty+1
#Otherwise, a new row is started, the current "non_empty"
#element is appended to it along with an empty braille cell,
#and the "current_row_length" counter is reset to the current
#contents of the new row.
else:
pef_rows.append("</row>\n\t\t\t\t\t<row>" + non_empty + "⠀")
current_row_length = length_non_empty+1
#"pef_file_string" is overwritten with the joining of every element
#of the "pef_rows" list with empty strings, as the empty braille cells
#have already been added in between the "non_empty_braille_cells" elements.
pef_file_string = "".join(pef_rows)
#Pagination: the rows now have to be grouped into pages holding at most
#"lines_per_page" rows. The start indices of every "</page>" and "</row>"
#closing tag already present in "pef_file_string" are collected, and
#"row_count" tracks how many rows were walked over since the last page break.
page_closing_tag_match_indices = [match.start() for match in re.finditer("</page>", pef_file_string)]
row_closing_tag_match_indices = [match.start() for match in re.finditer("</row>", pef_file_string)]
row_count = 0
#The row closing tags are visited from the last one to the first one, so that
#inserting text into "pef_file_string" never shifts the indices that remain
#to be visited. For every row closing tag:
#- if it lies after the last page break still listed in
#  "page_closing_tag_match_indices" (or if there is no page break left) and
#  it is not the very first row closing tag, it is either counted, or, once
#  "lines_per_page - 1" rows were counted, a page break is inserted after it
#  and the counter restarts at zero;
#- if it lies at or before that page break, the page break has been walked
#  past: it is removed from the list and the counter restarts at zero.
for i in reversed(range(len(row_closing_tag_match_indices))):
    row_index = row_closing_tag_match_indices[i]
    after_last_page_break = (not page_closing_tag_match_indices or
    row_index > page_closing_tag_match_indices[-1])
    if i > 0 and after_last_page_break:
        if row_count < lines_per_page - 1:
            row_count += 1
        elif row_count == lines_per_page - 1:
            #Everything up to index "row_index+7" is kept, the character at
            #"row_index+7" is dropped and the page tags are spliced in its place
            #(presumably the row break has the '</row>\n\t\t\t\t\t<row>' layout
            #used when filling "pef_rows", so that one tab is swapped for the
            #page tags followed by a tab).
            text_before_break = pef_file_string[:row_index+7]
            text_after_break = pef_file_string[row_index+8:]
            pef_file_string = (text_before_break +
            "\t\t\t\t</page>\n\t\t\t\t<page>\n\t" + text_after_break)
            row_count = 0
    elif page_closing_tag_match_indices and row_index <= page_closing_tag_match_indices[-1]:
        page_closing_tag_match_indices.pop()
        row_count = 0
#The PEF file is assembled by sandwiching "pef_file_string" in between the
#opening and closing volume, section, page and row tags. The variables
#"columns_per_page" and "lines_per_page" are written into the "cols" and
#"rows" attributes of the "<volume>" PEF tag, so that the PEF file is
#generated according to the user's specifications.
#The file is explicitly opened with the UTF-8 encoding: the XML declaration
#written below announces encoding="UTF-8" and the content is made up of
#braille Unicode characters, which the platform's default encoding (used by
#open() when no encoding is given) may be unable to represent.
with open(path + OCR_text_file_name + ".pef", "w", encoding="utf-8") as pef_file:
    pef_file.write("""<?xml version="1.0" encoding="UTF-8"?>
<pef version="2008-1" xmlns="http://www.daisy.org/ns/2008/pef">
<head>
<meta xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:format>application/x-pef+xml</dc:format>
<dc:identifier>org.pef-format.00002</dc:identifier>
</meta>
</head>
<body>
<volume cols=""" + '"' + str(columns_per_page) + '"' + " rows=" + '"'
    + str(lines_per_page) + '"' + """ rowgap="0" duplex="false">
<section>
<page>
<row>⠀""" + pef_file_string +
    """</row>
</page>
</section>
</volume>
</body>
</pef>""")
#The "dot locator for mention" symbols ("⠨⠿") are stripped from the braille text, since
#they have no use in the printed English transcription (the PEF file written earlier
#still contains them). This has to happen before the typos are removed: a typo typed
#right after a "dot locator for mention" would give at least two consecutive "⠿"
#symbols, and removing those would leave a stray "⠨" character behind.
#An empty braille cell is also appended to the end of the text. Some of the transcription
#code below inspects the character that follows a match to decide on the outcome, and the
#trailing cell spares that code from handling a match located at the very end of the
#document (words are normally followed by a punctuation mark anyway). This superfluous
#cell is removed at the end of the code.
dot_locator = re.compile("⠨⠿")
text_without_dot_locators = dot_locator.sub("", character_string)
new_character_string = text_without_dot_locators + "⠀"
#The transcriber-defined typeform indicators must be removed from the printed English
#transcription. (This step is not performed when generating the PEF file.)
#Every indicator is listed as its own three-cell element: a missing comma between the
#last two elements would make Python concatenate them into a single six-cell string,
#and neither of the two indicators would then be removed on its own.
tdti_list = ["⠈⠼⠂", "⠈⠼⠆", "⠈⠼⠶", "⠈⠼⠠", "⠘⠼⠂", "⠘⠼⠆", "⠘⠼⠶", "⠘⠼⠠", "⠸⠼⠂", "⠸⠼⠆", "⠸⠼⠶",
"⠸⠼⠠", "⠐⠼⠂", "⠐⠼⠆", "⠐⠼⠶", "⠐⠼⠠", "⠨⠼⠂", "⠨⠼⠆", "⠨⠼⠶", "⠨⠼⠠"]
#The indicators are plain character sequences (no regular expression syntax is
#involved), so a literal string replacement is all that is needed.
for tdti in tdti_list:
    new_character_string = new_character_string.replace(tdti, "")
#The "horizontal line mode indicator, ⠐⠒" is not handled, as this application is not
#expected to be used to draw diagrams. Should it ever be supported, it would have to be
#removed from the printed English text like the indicators removed above.
#
#The final-letter groupsigns "⠨⠎", "⠰⠎" and "⠨⠝" stand for the suffixes "less", "ness"
#and "sion", which are also whole English words. To avoid ambiguities (such as " ⠰⠎ "
#meaning "grade 1 's'"), a groupsign is only expanded when the character right before
#it is a letter: a braille letter, an already transcribed printed letter, a contraction
#character or one of the "ambiguous_characters". Because "⠰⠎" starts with the grade I
#symbol, "ness" has to be expanded before the grade I section below. "less" and "sion"
#start with "⠨", which isn't a letter and therefore wouldn't be found in a grade I
#passage, so handling them here as well poses no problem.
braille_alphabet = list("⠁⠃⠉⠙⠑⠋⠛⠓⠊⠚⠅⠇⠍⠝⠕⠏⠟⠗⠎⠞⠥⠧⠺⠭⠽⠵") + list("abcdefghijklmnopqrstuvwxyz")
contraction_characters = list("⠡⠩⠹⠱⠳⠌⠣⠫⠻⠪⠜⠬⠲⠢⠔⠯⠿⠷⠮⠾")
ambiguous_characters = list("⠆⠒⠖⠶⠂")
groupsign_list = [["⠨⠎", "less"],["⠰⠎", "ness"],["⠨⠝", "sion"]]
#Characters that may directly precede a final-letter groupsign.
characters_before_groupsign = braille_alphabet + contraction_characters + ambiguous_characters
for groupsign in groupsign_list:
    braille_groupsign, printed_suffix = groupsign
    groupsign_match_indices = [match.start() for match in
    re.finditer(braille_groupsign, new_character_string)]
    #The hits are processed from the last one to the first one: each two-cell groupsign
    #is swapped for a four-letter suffix, which would stagger the indices of the hits
    #located further down the text if the substitutions started from the beginning.
    for match_index in reversed(groupsign_match_indices):
        #A groupsign at the very start of the text has no preceding character.
        if match_index > 0 and new_character_string[match_index-1] in characters_before_groupsign:
            text_before_groupsign = new_character_string[:match_index]
            text_after_groupsign = new_character_string[match_index+2:]
            new_character_string = text_before_groupsign + printed_suffix + text_after_groupsign
#The following section deals with grade I passage, word and symbol indicators.
#This section, along with the numerals section below, needs to be carried out before
#doing any other changes to the document, to avoid mixups.
#"grade_I_characters" maps every braille cell to its grade I (uncontracted) printed
#equivalent. The cells "⠔" and "⠢" are mapped to "⠰⠔" (superscript indicator) and
#"⠰⠢" (subscript indicator), respectively, so that these indicators survive the grade I
#transcription: they are processed towards the end of the code and must remain in
#"new_character_string" until then.
#The cell "⠦" is both the question mark and the opening double quote, but a dictionary
#can only hold one value per key: only the opening double quote mapping is kept, which
#is also the one that used to prevail when both entries were listed.
grade_I_characters = {"⠁":"a", "⠃":"b", "⠉":"c", "⠙":"d", "⠑":"e",
"⠋":"f", "⠛":"g", "⠓":"h", "⠊":"i", "⠚":"j", "⠅":"k", "⠇":"l", "⠍":"m",
"⠝":"n", "⠕":"o", "⠏":"p", "⠟":"q", "⠗":"r", "⠎":"s", "⠞":"t", "⠥":"u",
"⠧":"v", "⠺":"w", "⠭":"x", "⠽":"y", "⠵":"z", "⠂":",", "⠲":".",
"⠖":"!", "⠄":"’", "⠤":"-", "⠦":'“', "⠴":'”', "⠒": ":",
"⠆": ";", "⠶": r"\'27", "⠔":"⠰⠔", "⠢":"⠰⠢"}
#When the grade I passage indicator "⠰⠰⠰" is encountered, grade I transcription
#continues until the grade I terminator symbol ("⠰⠄") is met.
mapping_table_grade_I = str.maketrans(grade_I_characters)
grade_I_passage_match_indices = [match.start() for match in
re.finditer("⠰⠰⠰", new_character_string)]
#The passages are processed from the last one to the first one, so that the changes made
#to "new_character_string" do not stagger the indices of the passages yet to be processed.
for i in range(len(grade_I_passage_match_indices)-1, -1, -1):
    passage_indicator_index = grade_I_passage_match_indices[i]
    #str.index() raises a ValueError when no terminator follows the passage indicator
    #(for instance if the user forgot to include it). Only that specific error is caught,
    #so that any unrelated problem is not silently turned into a transcription note.
    try:
        index_grade_I_terminator = new_character_string.index("⠰⠄", passage_indicator_index+3)
    except ValueError:
        #The passage indicator is swapped for a note. An empty braille cell (u"\u2800")
        #is included after the note, so that the code can check for wordsigns that must
        #stand alone (be preceded by a space, hyphen/dashes, formatting indicators,
        #suitable punctuation marks): the empty cell acts as a "stand alone" delimiter
        #for any wordsign located right after the note.
        new_character_string = (new_character_string[:passage_indicator_index] +
        "[Transcription note: a grade I passage indicator was located here, but no grade I terminator was found after it.]⠀" +
        new_character_string[passage_indicator_index+3:])
    else:
        #The text is rebuilt from everything found before "⠰⠰⠰", the grade I
        #transcription of the passage (which starts three characters after the hit, in
        #order to skip over the indicator) and everything found after the terminator
        #(hence the "+2", which skips over the two cells of "⠰⠄").
        passage_string = (new_character_string[passage_indicator_index+3:index_grade_I_terminator]
        .translate(mapping_table_grade_I))
        new_character_string = (new_character_string[:passage_indicator_index] +
        passage_string + new_character_string[index_grade_I_terminator+2:])
#A grade I word indicator ("⠰⠰") starts a grade I transcription that lasts until the
#first of the following is met: an empty braille cell (u"\u2800"), the grade I
#terminator ("⠰⠄"), a hyphen ("⠤") or one of the dash symbols, namely the dash/en dash
#("⠠⠤"), the long dash/em dash ("⠐⠠⠤") or the underscore ("⠨⠤").
grade_I_word_match_indices = [match.start() for match in
re.finditer("⠰⠰", new_character_string)]
#The words are processed from the last one to the first one, so that the indices of
#the hits that remain to be processed are not staggered by the changes.
for i in reversed(range(len(grade_I_word_match_indices))):
    word_indicator_index = grade_I_word_match_indices[i]
    word_starting_index = word_indicator_index+2
    #Every possible terminator is searched for with the find() method, which returns -1
    #when there is no hit. Each search result is paired with the number of characters to
    #skip over once the terminator is reached: only the grade I terminator ("⠰⠄") is
    #discarded (2 characters), the other terminators are part of the text and are kept.
    terminator_candidates = [
        [new_character_string.find(u"\u2800", word_starting_index), 0],
        [new_character_string.find("⠰⠄", word_starting_index), 2],
        [new_character_string.find("⠨⠤", word_starting_index), 0],
        [new_character_string.find("⠠⠤", word_starting_index), 0],
        [new_character_string.find("⠐⠠⠤", word_starting_index), 0],
        [new_character_string.find("⠤", word_starting_index), 0]]
    found_terminators = [candidate for candidate in terminator_candidates if candidate[0] != -1]
    if found_terminators:
        #The earliest terminator ends the word. The word is transcribed in grade I and
        #the text is rebuilt from what precedes "⠰⠰", the transcribed word and what
        #follows it, starting at the terminator index plus "terminator_length".
        index_next_grade_I_terminator, terminator_length = min(
        found_terminators, key=lambda candidate: candidate[0])
        word_string = (new_character_string[word_starting_index:index_next_grade_I_terminator]
        .translate(mapping_table_grade_I))
        text_after_word = new_character_string[index_next_grade_I_terminator+terminator_length:]
    else:
        #Without any terminator after the grade I word indicator, the remainder of the
        #text is transcribed in grade I.
        index_next_grade_I_terminator = None
        word_string = new_character_string[word_starting_index:].translate(mapping_table_grade_I)
        text_after_word = ""
    new_character_string = (new_character_string[:word_indicator_index] +
    word_string + text_after_word)
#The remaining grade I symbol indicators ("⠰") only apply to the character that follows
#them. Six of these two-cell sequences are ambiguous, as they are also final-letter
#groupsigns: "⠰⠑" (e/ence), "⠰⠛" (g/ong), "⠰⠇" (l/ful), "⠰⠝" (n/tion), "⠰⠞" (t/ment)
#and "⠰⠽" (y/ity). Each element of "grade_I_ambiguities" holds the grade I reading
#first and the final-letter groupsign reading second.
#A final-letter groupsign has to be preceded by a letter, whereas a grade I letter
#wouldn't be preceded by a letter before its "⠰" symbol. The printed English letters
#are part of "braille_alphabet" in order to account for the braille characters that
#were already transcribed. Although the final-letter groupsigns should only follow
#letters according to the National Federation of the Blind (NFB), the hyphen ("⠤") is
#accepted as well, for more leniency as to where a hyphen may be placed in a word.
grade_I_ambiguities = [[["⠑", "e"], ["⠑", "ence"]], [["⠛", "g"], ["⠰⠛", "ong"]], [["⠇", "l"],
["⠇", "ful"]], [["⠝", "n"], ["⠝", "tion"]], [["⠞", "t"], ["⠞", "ment"]], [["⠽", "y"], ["⠽", "ity"]]]
#The set of characters allowed before a final-letter groupsign is built only once,
#instead of concatenating the lists anew for every comparison.
characters_before_final_letter_groupsign = set(braille_alphabet + contraction_characters + ["⠤"])
grade_I_symbol_match_indices = [match.start() for match in
re.finditer("⠰", new_character_string)]
#The hits are processed from the last one to the first one, as the substitutions change
#the length of the text and would otherwise stagger the indices of the following hits.
for i in range(len(grade_I_symbol_match_indices)-1, -1, -1):
    grade_I_symbol_index = grade_I_symbol_match_indices[i]
    #A grade I symbol ending the text has no character to apply to and is left as is
    #(looking up the following character would raise an IndexError).
    if grade_I_symbol_index + 1 >= len(new_character_string):
        continue
    character_after_grade_I_symbol = new_character_string[grade_I_symbol_index+1]
    #A grade I symbol located at the very start of the text has no preceding character.
    #The index is checked explicitly, as "new_character_string[-1]" would otherwise
    #wrap around and return the last character of the text.
    preceded_by_letter = (grade_I_symbol_index > 0 and
    new_character_string[grade_I_symbol_index-1] in characters_before_final_letter_groupsign)
    ambiguity = None
    for char in grade_I_ambiguities:
        if char[0][0] == character_after_grade_I_symbol:
            ambiguity = char
            break
    #"characters_to_skip" is the number of characters dropped from the text, starting at
    #the grade I symbol: 2 when both the symbol and the following character are replaced,
    #1 when only the symbol is replaced.
    if ambiguity is not None:
        #Ambiguous character: final-letter groupsign if a letter (or hyphen) comes
        #before the grade I symbol, grade I letter otherwise.
        replacement = ambiguity[1][1] if preceded_by_letter else ambiguity[0][1]
        characters_to_skip = 2
    elif character_after_grade_I_symbol in grade_I_characters:
        #Any other character known to "grade_I_characters" is mapped to its grade I
        #printed equivalent.
        replacement = grade_I_characters[character_after_grade_I_symbol]
        characters_to_skip = 2
    else:
        #The character after the grade I symbol could not be transcribed in grade I.
        #A note takes the place of the grade I symbol and the character that followed
        #it is kept right after the note (hence skipping a single character).
        replacement = "[Transcription note: a grade I symbol character was found here, but the following character was not recognized as a letter, and so could not be transcribed in grade I.]⠀"
        characters_to_skip = 1
    new_character_string = (new_character_string[:grade_I_symbol_index] + replacement +
    new_character_string[grade_I_symbol_index+characters_to_skip:])
#Numerals: after a numeric indicator ("⠼"), the braille characters for the letters "a"
#to "j" stand for the digits 1 to 9 and 0, and are transcribed on a one-to-one basis.
#This section, along with the grade I section above, needs to be carried out before
#doing any other changes to the document, to avoid mixups.
numeral_characters = {"⠁":"1", "⠃":"2", "⠉":"3", "⠙":"4", "⠑":"5",
"⠋":"6", "⠛":"7", "⠓":"8", "⠊":"9", "⠚":"0", "⠂": ",", "⠲": ".", "⡈":"/"}
mapping_table_numerals = str.maketrans(numeral_characters)
#Characters that keep the numeric mode going: the letters "a" to "j", commas ("⠂"),
#periods ("⠲", also used as decimal points or computer dots) and fraction lines ("⡈").
list_of_numeral_characters = ["⠁", "⠃", "⠉", "⠙", "⠑", "⠋", "⠛", "⠓", "⠊", "⠚", "⠂", "⠲", "⡈"]
#A number is a numeric indicator followed by the longest possible run of the characters
#listed above. The run ends at the first character that is not in the list (which
#includes the next numeric indicator) or at the end of the document. That terminating
#character is relevant and is left untouched, as opposed to the grade I terminators
#("⠰⠄") that were skipped over in the section above.
numeral_pattern = re.compile("⠼([" + "".join(list_of_numeral_characters) + "]*)")
#Every number is substituted in a single pass: the numeric indicator is dropped and the
#run that follows it is translated to printed digits and punctuation. As all of the
#substitutions are computed on the original text, there are no staggered indices to
#account for.
new_character_string = numeral_pattern.sub(
lambda number_match: number_match.group(1).translate(mapping_table_numerals),
new_character_string)
#"shortform_words" pairs every shortform (as a sequence of braille characters) with the
#printed English word it stands for. The order of the list matters: the pairs are sorted
#by decreasing number of braille characters, so that a long shortform is always dealt
#with before any shorter shortform it contains. For instance, "perceiving" ("⠏⠻⠉⠧⠛")
#comes before "perceive" ("⠏⠻⠉⠧"), to avoid being left with a stray "⠛", should the
#substitutions proceed in the reverse order. New entries must respect this ordering.
#Please consult the following reference for a list of UEB contractions:
#https://www.brailleauthority.org/ueb/symbols_list.pdf. All of the contractions and combined
#braille symbols must be processed before individually transcribing the remaining characters
#on a one-to-one basis to their printed English equivalents.
shortform_words = [['⠏⠻⠉⠧⠛', 'perceiving'], ['⠽⠗⠧⠎', 'yourselves'], ['⠮⠍⠧⠎', 'themselves'],
['⠗⠚⠉⠛', 'rejoicing'], ['⠗⠉⠧⠛', 'receiving'], ['⠏⠻⠉⠧', 'perceive'], ['⠳⠗⠧⠎', 'ourselves'],
['⠙⠉⠇⠛', 'declaring'], ['⠙⠉⠧⠛', 'deceiving'], ['⠒⠉⠧⠛', 'conceiving'], ['⠁⠋⠺⠎', 'afterwards'],
['⠽⠗⠋', 'yourself'], ['⠞⠛⠗', 'together'], ['⠹⠽⠋', 'thyself'], ['⠗⠚⠉', 'rejoice'], ['⠗⠉⠧', 'receive'],
['⠏⠻⠓', 'perhaps'], ['⠐⠕⠋', 'oneself'], ['⠝⠑⠊', 'neither'], ['⠝⠑⠉', 'necessary'], ['⠍⠽⠋', 'myself'],
['⠊⠍⠍', 'immediate'], ['⠓⠍⠋', 'himself'], ['⠓⠻⠋', 'herself'], ['⠛⠗⠞', 'great'], ['⠙⠉⠇', 'declare'],
['⠙⠉⠧', 'deceive'], ['⠒⠉⠧', 'conceive'], ['⠃⠗⠇', 'braille'], ['⠁⠇⠺', 'always'], ['⠁⠇⠞', 'altogether'],
['⠁⠇⠹', 'although'], ['⠁⠇⠗', 'already'], ['⠁⠇⠍', 'almost'], ['⠁⠛⠌', 'against'], ['⠁⠋⠝', 'afternoon'],
['⠁⠋⠺', 'afterward'], ['⠁⠉⠗', 'across'], ['⠁⠃⠧', 'above'], ['⠽⠗', 'your'], ['⠺⠙', 'would'], ['⠞⠝', 'tonight'],
['⠞⠍', 'tomorrow'], ['⠞⠙', 'today'], ['⠎⠡', 'such'], ['⠩⠙', 'should'], ['⠎⠙', 'said'], ['⠟⠅', 'quick'],
['⠏⠙', 'paid'], ['⠍⠌', 'must'], ['⠍⠡', 'much'], ['⠇⠇', 'little'], ['⠇⠗', 'letter'], ['⠭⠋', 'itself'],
['⠭⠎', 'its'], ['⠓⠍', 'him'], ['⠛⠙', 'good'], ['⠋⠗', 'friend'], ['⠋⠌', 'first'], ['⠑⠊', 'either'],
['⠉⠙', 'could'], ['⠡⠝', 'children'], ['⠃⠇', 'blind'], ['⠁⠇', 'also'], ['⠁⠛', 'again'],
['⠁⠋', 'after'], ['⠁⠉', 'according'], ['⠁⠃', 'about']]
for word in shortform_words:
word_length = len(word[0])
word_matches = re.finditer(word[0], new_character_string)
word_match_indices = [match.start() for match in word_matches]
for i in range(len(word_match_indices)-1, -1, -1):
#"word_match_indices[i] == len(new_character_string) - (word_length + 1)"
#means that there is only one braille character after the "word[0]" match.
#This is necessary, as an error would be raised if we were to look two
#characters ahead. "word_match_indices[i] + word_length" is looking at
#the braille character directly following the "word[0]" match. If there
#is only one braille character after the "word[0]" match and that braille
#character is either an empty braille cell (u"\u2800"), hyphen ("⠤"),
#period ("⠲"), apostrophe ("⠄"), comma ("⠂"), colon ("⠒"), semicolon ("⠆")
#question mark ("⠦"), exclamation mark ("⠖") or closing double quote ("⠴"),
#then "word[0]" meets the requirements to be free standing on its right side.
#We then proceed to look at its left side (before it) to ensure that it is
#really free standing.
if (word_match_indices[i] == len(new_character_string) - (word_length + 1) and
new_character_string[word_match_indices[i] + word_length] in
[u"\u2800", "⠤", "⠲", "⠄", "⠂", "⠒", "⠆", "⠦", "⠖", "⠴"]):
#Now looking at the characters before the "word[0]" match. If there
#are no braille characters before the start of "word[0]" and the conditions
#in the parent "if" statement are met, than the shortform word is freestanding
#and the substitution takes place.
if word_match_indices[i] == 0:
new_character_string = word[1] + new_character_string[word_match_indices[i] + word_length:]
#If there is only one braille character before the start of "word[0]",
#and that character is either an empty braille cell (u"\u2800"), a
#hyphen ("⠤"), a capitalization symbol ("⠠") or a double opening
#quote ("⠦"), then the substitution of the shortform word "word[0]"
#can take place, as "word[0]" stands alone:
elif (word_match_indices[i] == 1 and
new_character_string[word_match_indices[i]-1] in [u"\u2800", "⠤", "⠦", "⠠"]):
new_character_string = (new_character_string[:word_match_indices[i]]
+ word[1] + new_character_string[word_match_indices[i] + word_length:])
#If there are two braille characters before the start of "word[0]", and
#those characters are either an empty braille cell (u"\u2800"), hyphen
#("⠤" or dash symbols that end with "⠤", such as minus sign ("⠐⠤"),
#dash/en dash("⠠⠤") or underscore ("⠨⠤")), capitalization symbol ("⠠"),
#opening single ("⠠⠦") or double ("⠦", "⠘⠦", "⠸⠦") quotes, any
#typeform indicators for symbols, words or passages written in
#italics ("⠨⠆", "⠨⠂", "⠨⠶"), bold ("⠘⠆", "⠘⠂", "⠘⠶"),
#underline ("⠸⠆", "⠸⠂", "⠸⠶") or script ("⠈⠆", "⠈⠂", "⠈⠶"),
#opening parenthesis ("⠐⠣"), square bracket ("⠨⠣") or curly
#bracket ("⠸⠣"), then the substitution of the shortform
#word "word[0]" can take place, as "word[0]" stands alone.
#The en dash and underscore are covered in looking or the "⠤"
#character preceding the "⠠⠴" match, and so are not included
#in the list of two braille characters.
elif (word_match_indices[i] == 2 and
(new_character_string[word_match_indices[i]-2:word_match_indices[i]] in
["⠠⠦", "⠘⠦", "⠸⠦", "⠨⠆", "⠨⠂", "⠨⠶", "⠘⠆", "⠘⠂", "⠘⠶",
"⠸⠆", "⠸⠂", "⠸⠶", "⠈⠆", "⠈⠂", "⠈⠶", "⠐⠣", "⠨⠣", "⠸⠣"] or
new_character_string[word_match_indices[i]-1] in [u"\u2800", "⠤", "⠦", "⠠"])):
new_character_string = (new_character_string[:word_match_indices[i]]
+ word[1] + new_character_string[word_match_indices[i] + word_length:])
#If the start of "word[0]" is located at least three braille characters from
#the start of "new_character_string", and that word[0] is flanked either by
#an empty braille cell (u"\u2800") or a hyphen ("⠤" or dash symbols that end
#with "⠤" such as minus sign ("⠐⠤"), dash/en dash("⠠⠤"), long dash/em dash("⠐⠠⠤"),
#or underscore ("⠨⠤")), capitalization symbol ("⠠"), opening single ("⠠⠦")
#or double ("⠦", "⠘⠦", "⠸⠦") quotes, any typeform indicators for symbols,
#words or passages written in italics ("⠨⠆", "⠨⠂", "⠨⠶"), bold ("⠘⠆", "⠘⠂", "⠘⠶"),
#underline ("⠸⠆", "⠸⠂", "⠸⠶") or script ("⠈⠆", "⠈⠂", "⠈⠶"),
#opening parenthesis ("⠐⠣", "⠠⠐⠣"), square bracket ("⠨⠣", "⠠⠨⠣") or curly
#bracket ("⠸⠣", "⠠⠸⠣"), then the substitution of the shortform word "word[0]"
#can take place, as "word[0]" stands alone. The em dash, en dash and underscore
#are covered in looking for the "⠤" character preceding the "⠠⠴" match, and so
#are not included in the list of two and three braille characters.
elif (word_match_indices[i] >= 3 and
(new_character_string[word_match_indices[i]-3:word_match_indices[i]] in
["⠠⠐⠣", "⠠⠨⠣", "⠠⠸⠣"] or
new_character_string[word_match_indices[i]-2:word_match_indices[i]] in
["⠠⠦", "⠘⠦", "⠸⠦", "⠨⠆", "⠨⠂", "⠨⠶", "⠘⠆", "⠘⠂", "⠘⠶",
"⠸⠆", "⠸⠂", "⠸⠶", "⠈⠆", "⠈⠂", "⠈⠶", "⠐⠣", "⠨⠣", "⠸⠣"] or
new_character_string[word_match_indices[i]-1] in [u"\u2800", "⠤", "⠦", "⠠"])):
new_character_string = (new_character_string[:word_match_indices[i]]
+ word[1] + new_character_string[word_match_indices[i] + word_length:])
#"word_match_indices[i] == len(new_character_string) - (word_length + 2)"
#means that there are only two braille characters after the "word[0]" match.
#This is necessary, as an error would be raised if we were to look three
#characters ahead. If word[0] is flanked to the right by two braille characters
#consisting of either closing single ("⠠⠴") or double ("⠘⠴", "⠸⠴") quotes,
#closing parenthesis ("⠐⠜"), or square ("⠨⠜") or curly ("⠸⠜") brackets,
#minus sign ("⠐⠤", which some people could mistakenly use as a hyphen),
#en-dash ("⠠⠤"), underscore ("⠨⠤") or the terminators for passages or words
#written in italics ("⠨⠄"), bold ("⠘⠄"), underline ("⠸⠄") or script ("⠈⠄"),
#then then "word[0]" meets the requirements to be free standing on its right side.
#We then proceed to look at its left side (before it) to ensure that it is
#really free standing.
#Alternatively, if the character direcly after word[0] is either an empty
#braille cell (u"\u2800"), hyphen ("⠤"), period ("⠲"), apostrophe ("⠄"),
#comma ("⠂"), colon ("⠒"), semicolon ("⠆") question mark ("⠦"),
#exclamation mark ("⠖") or closing double quote ("⠴"), then "word[0]"
#meets the requirements to be free standing on its right side. We then
#proceed to look at its left side (before it) to ensure that it is
#really free standing.
elif (word_match_indices[i] == len(new_character_string) - (word_length + 2) and
(new_character_string[word_match_indices[i] + word_length:word_match_indices[i] + word_length + 2] in
["⠠⠴", "⠘⠴", "⠸⠴", "⠐⠜", "⠨⠜", "⠸⠜", "⠐⠤", "⠠⠤", "⠨⠤", "⠨⠄", "⠘⠄", "⠸⠄", "⠈⠄"] or
new_character_string[word_match_indices[i] + word_length] in
[u"\u2800", "⠤", "⠲", "⠄", "⠂", "⠒", "⠆", "⠦", "⠖", "⠴"])):
if word_match_indices[i] == 0:
new_character_string = word[1] + new_character_string[word_match_indices[i] + word_length:]
elif (word_match_indices[i] == 1 and
new_character_string[word_match_indices[i]-1] in [u"\u2800", "⠤", "⠦", "⠠"]):
new_character_string = (new_character_string[:word_match_indices[i]]
+ word[1] + new_character_string[word_match_indices[i] + word_length:])
elif (word_match_indices[i] == 2 and
(new_character_string[word_match_indices[i]-2:word_match_indices[i]] in
["⠠⠦", "⠘⠦", "⠸⠦", "⠨⠆", "⠨⠂", "⠨⠶", "⠘⠆", "⠘⠂", "⠘⠶",
"⠸⠆", "⠸⠂", "⠸⠶", "⠈⠆", "⠈⠂", "⠈⠶", "⠐⠣", "⠨⠣", "⠸⠣"] or
new_character_string[word_match_indices[i]-1] in [u"\u2800", "⠤", "⠦", "⠠"])):
new_character_string = (new_character_string[:word_match_indices[i]]
+ word[1] + new_character_string[word_match_indices[i] + word_length:])
elif (word_match_indices[i] >= 3 and
(new_character_string[word_match_indices[i]-3:word_match_indices[i]] in
["⠠⠐⠣", "⠠⠨⠣", "⠠⠸⠣"] or
new_character_string[word_match_indices[i]-2:word_match_indices[i]] in
["⠠⠦", "⠘⠦", "⠸⠦", "⠨⠆", "⠨⠂", "⠨⠶", "⠘⠆", "⠘⠂", "⠘⠶",
"⠸⠆", "⠸⠂", "⠸⠶", "⠈⠆", "⠈⠂", "⠈⠶", "⠐⠣", "⠨⠣", "⠸⠣"] or
new_character_string[word_match_indices[i]-1] in [u"\u2800", "⠤", "⠦", "⠠"])):
new_character_string = (new_character_string[:word_match_indices[i]]
+ word[1] + new_character_string[word_match_indices[i] + word_length:])
#Looking at up to three braille cells following the "word[0]" match, hence the
#"word_match_indices[i] <= len(new_character_string) - (word_length +3)".
#If word[0] is flanked to the right by three braille characters making up either
#a multi-line closing parenthesis ("⠠⠐⠜"), square ("⠠⠨⠜") or curly ("⠠⠸⠜") bracket or
#an em-dash ("⠐⠠⠤"), then then "word[0]" meets the requirements to be free standing
#on its right side. We then proceed to look at its left side (before it) to ensure
#that it is really free standing.
#On the other hand, if word[0] is flanked to the right by two braille characters
#consisting of either closing single ("⠠⠴") or double ("⠘⠴", "⠸⠴") quotes,
#closing parenthesis ("⠐⠜"), or square ("⠨⠜") or curly ("⠸⠜") brackets,