Skip to content

Commit

Permalink
also make the definite article removal case insensitive, and remove u…
Browse files Browse the repository at this point in the history
…seless repeat of roman numeral case removal by just moving the lowercasing up
  • Loading branch information
i30817 committed Jun 3, 2023
1 parent 74fdd86 commit 4894ea4
Showing 1 changed file with 35 additions and 55 deletions.
90 changes: 35 additions & 55 deletions libretrofuzz/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,85 +233,65 @@ def normalizer(t, nometa, hack):
#strips just because the user may have made a mistake naming the source
#(or the replacement above introduced boundary spaces)
t = t.strip()
#normalize case
t = t.lower()
#beginning and end definite articles in several european languages (people move them)
#make sure we're only removing the capitalized start and end forms with spaces
t = removefirst(t, ', The')
t = removeprefix(t, 'The ')
t = removefirst(t, ', Los')
t = removeprefix(t, 'Los ')
t = removefirst(t, ', Las')
t = removeprefix(t, 'Las ')
t = removefirst(t, ', Les')
t = removeprefix(t, 'Les ')
t = removefirst(t, ', Le')
t = removeprefix(t, 'Le ')
t = removefirst(t, ', La')
t = removeprefix(t, 'La ')
t = removefirst(t, ', L\'')
#make sure we're only removing the start and end forms with spaces
t = removefirst(t, ', the')
t = removeprefix(t, 'the ')
t = removefirst(t, ', los')
t = removeprefix(t, 'los ')
t = removefirst(t, ', las')
t = removeprefix(t, 'las ')
t = removefirst(t, ', les')
t = removeprefix(t, 'les ')
t = removefirst(t, ', le')
t = removeprefix(t, 'le ')
t = removefirst(t, ', la')
t = removeprefix(t, 'la ')
t = removefirst(t, ', l\'')
#L' sometimes omits the space so always remove L' at the start even without space
t = removeprefix(t, 'L\'') #if there is a extra space the next join will remove it
t = removefirst(t, ', Der')
t = removeprefix(t, 'Der ')
t = removefirst(t, ', Die')
t = removeprefix(t, 'Die ')
t = removefirst(t, ', Das')
t = removeprefix(t, 'Das ')
t = removefirst(t, ', El')
t = removeprefix(t, 'El ')
t = removefirst(t, ', Os')
t = removeprefix(t, 'Os ')
t = removefirst(t, ', As')
t = removeprefix(t, 'As ')
t = removefirst(t, ', O')
t = removeprefix(t, 'O ')
t = removefirst(t, ', A')
t = removeprefix(t, 'A ')
t = removeprefix(t, 'l\'') #if there is a extra space the next join will remove it
t = removefirst(t, ', der')
t = removeprefix(t, 'der ')
t = removefirst(t, ', die')
t = removeprefix(t, 'die ')
t = removefirst(t, ', das')
t = removeprefix(t, 'das ')
t = removefirst(t, ', el')
t = removeprefix(t, 'el ')
t = removefirst(t, ', os')
t = removeprefix(t, 'os ')
t = removefirst(t, ', as')
t = removeprefix(t, 'as ')
t = removefirst(t, ', o')
t = removeprefix(t, 'o ')
t = removefirst(t, ', a')
t = removeprefix(t, 'a ')
#remove the symbols used in the definite article normalization
t = replacemany(t, ',\'', '')
#Tries to make roman numerals in the range 1-20 equivalent to normal numbers (to handle names that change it).
#If both sides are roman numerals there is no harm done if XXIV gets turned into 204 on both sides.
t = t.replace('XVIII', '18')
t = t.replace('xviii', '18')
t = t.replace('XVII', '17')
t = t.replace('xvii', '17')
t = t.replace('XVI' , '16')
t = t.replace('xvi' , '16')
t = t.replace('XIII', '13')
t = t.replace('xiii', '13')
t = t.replace('XII' , '12')
t = t.replace('xii' , '12')
t = t.replace('XIV' , '14')
t = t.replace('xiv' , '14')
t = t.replace('XV' , '15')
t = t.replace('xv' , '15')
t = t.replace('XIX', '19')
t = t.replace('xix', '19')
t = t.replace('XX', '20')
t = t.replace('xx', '20')
t = t.replace('XI', '11')
t = t.replace('xi', '11')
t = t.replace('VIII', '8')
t = t.replace('viii', '8')
t = t.replace('VII', '7')
t = t.replace('vii', '7')
t = t.replace('VI' , '6')
t = t.replace('vi' , '6')
t = t.replace('III', '3')
t = t.replace('iii', '3')
t = t.replace('II' , '2')
t = t.replace('ii' , '2')
t = t.replace('IV' , '4')
t = t.replace('iv' , '4')
t = t.replace('V' , '5')
t = t.replace('v' , '5')
t = t.replace('IX', '9')
t = t.replace('ix', '9')
t = t.replace('X', '10')
t = t.replace('x', '10')
t = t.replace('I', '1')
t = t.replace('i', '1')
#remove the symbols used in the definite article normalization
t = replacemany(t, ',\'', '')
#normalize case
t = t.lower()
#this makes sure that if a remote name has ' and ' instead of ' _ ' to replace ' & ' it works
#': ' doesn't need this because ':' is a forbidden character and both '_' and '-' turn to ''
t = t.replace(' and ', '')
Expand Down

0 comments on commit 4894ea4

Please sign in to comment.