Skip to content

Commit

Permalink
improve normalizer by making split_camelcase obligatory and done both…
Browse files Browse the repository at this point in the history
… sides of the local/remote divide to match
  • Loading branch information
i30817 committed Jun 4, 2023
1 parent 522a4b9 commit 231e385
Showing 1 changed file with 12 additions and 15 deletions.
27 changes: 12 additions & 15 deletions libretrofuzz/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,8 @@ def replacemany(our_str, to_be_replaced, replace_with):
our_str = our_str.replace(nextchar, replace_with)
return our_str

def if_not_spaced_split_camelcase(name: str):
    """Split camelcase words apart, but only when the name has no space.

    A name that already contains a ' ' is returned untouched. Otherwise
    every run that starts with an ASCII capital letter becomes its own
    space-separated word (so 'CamelCase' turns into 'Camel Case').
    """
    # a name that already has a space is left exactly as given
    if ' ' in name:
        return name
    # the capturing group makes re.split keep the capital-led runs;
    # filter(None, ...) drops the empty strings found between them
    pieces = filter(None, re.split('([A-Z][^A-Z]*)', name))
    return ' '.join(pieces)
def split_camelcase(name: str):
    """Return *name* with a single space before each capital-led word.

    The name is cut into runs that start with an ASCII capital letter
    (plus any leading run without capitals). Each run is trimmed of
    surrounding whitespace and the runs are joined with one space, so
    'CamelCase' becomes 'Camel Case' and an already spaced name such as
    'Super Mario' comes back unchanged. Every capital starts a new run,
    so 'ABC' becomes 'A B C'.
    """
    # the capturing group makes re.split keep the capital-led runs
    pieces = (piece.strip() for piece in re.split('([A-Z][^A-Z]*)', name))
    # filter AFTER stripping: a whitespace-only piece (leading spaces
    # before the first capital) must not survive as an empty word, or
    # the join would add a spurious boundary space
    return ' '.join(piece for piece in pieces if piece)

def removefirst(name: str, suf: str):
    """Return *name* with only its first occurrence of *suf* deleted.

    The occurrence may be anywhere in *name*; later occurrences are kept.
    If *suf* does not occur, *name* is returned unchanged.
    """
    only_once = 1
    return name.replace(suf, '', only_once)
Expand Down Expand Up @@ -233,6 +230,9 @@ def normalizer(t, nometa, hack):
#strips just because the user may have made a mistake naming the source
#(or the replacement above introduce boundary spaces)
t = t.strip()
#CamelCaseNames for local labels are common when there are no spaces;
#splitting them here lets the definite article normalization below rely on spaces only (minimizes changes)
t = split_camelcase(t)
#normalize case
t = t.lower()
#beginning and end definite articles in several european languages (people move them)
Expand Down Expand Up @@ -270,6 +270,12 @@ def normalizer(t, nometa, hack):
t = removeprefix(t, 'a ')
#remove the symbols used in the definite article normalization
t = replacemany(t, ',\'', '')
#this makes sure that if a remote name has ' and ' instead of ' _ ' to replace ' & ' it works
#': ' doesn't need this because ':' is a forbidden character and both '_' and '-' turn to ''
t = t.replace(' and ', '')
#although all names have spaces (now), the local names may have weird spaces,
#so remove all whitespace to equalize them after the space dependent checks (this also strips)
t = ''.join(t.split())
#Tries to make roman numerals in the range 1-20 equivalent to normal numbers (to handle names that change it).
#If both sides are roman numerals there is no harm done if XXIV gets turned into 204 in both sides.
t = t.replace('xviii', '18')
Expand All @@ -292,12 +298,6 @@ def normalizer(t, nometa, hack):
t = t.replace('ix', '9')
t = t.replace('x', '10')
t = t.replace('i', '1')
#this makes sure that if a remote name has ' and ' instead of ' _ ' to replace ' & ' it works
#': ' doesn't need this because ':' is a forbidden character and both '_' and '-' turn to ''
t = t.replace(' and ', '')
#although all names have spaces (now), the local names may have weird spaces,
#so to equalize them after the space dependent checks (this also strips)
t = ''.join(t.split())
#remove diacritics (does nothing to asian languages diacritics, only for 2 to 1 character combinations)
t = u''.join([c for c in unicodedata.normalize('NFKD', t) if not unicodedata.combining(c)])
return t
Expand Down Expand Up @@ -655,10 +655,7 @@ async def downloader(names: [(str,str)],
#only the local names should have forbidden characters
name = re.sub(forbidden, '_', name )
nameaux = re.sub(forbidden, '_', nameaux )

#CamelCaseNames for local labels are common when there are no spaces,
#do this to normalize definite articles in normalization with spaces only (minimizes changes)
nameaux = if_not_spaced_split_camelcase(nameaux)

#unlike the server thumbnails, normalization wasn't done yet
nameaux = norm(nameaux, nometa, hack)

Expand Down

0 comments on commit 231e385

Please sign in to comment.