Skip to content

Commit

Permalink
Fix that regex optimization
Browse files (browse the repository at this point in the history)
  • Loading branch information
kbourgoin committed Jul 18, 2013
1 parent 58cfb20 commit cf8a8e7
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 8 deletions.
10 changes: 6 additions & 4 deletions serpextract/serpextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,16 @@ def _get_lossy_domain(domain):
r'(?:w+\d*\.|search\.|m\.)*' + # www. www1. search. m.
r'((?P<ccsub>{})\.)?'.format(codes) + # country-code subdomain
r'(?P<domain>.*?)' + # domain
-r'(\.(?P<tld>com|org|net|co|it|edu))?' + # tld
-r'(\.(?P<tldcc>{}))?'.format(codes) + # country-code tld
+r'(?P<tld>\.(com|org|net|co|it|edu))?' + # tld
+r'(?P<tldcc>\.({}))?'.format(codes) + # country-code tld
r'$') # all done

res = _get_lossy_domain_regex.match(domain).groupdict()
-output = '%s%s.{}' % ('{}.' if res['ccsub'] else '', domain)
+output = '%s%s%s' % ('{}.' if res['ccsub'] else '',
+res['domain'],
+'.{}' if res['tldcc'] else res['tld'] or '')
_domain_cache[domain] = output # Add to LRU cache
-return domain
+return output


class ExtractResult(object):
Expand Down
5 changes: 1 addition & 4 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ def test_get_lossy_domain(self):
url = 'search.a.com'
self.assertEqual(get_lossy_domain(url), expected)

-url = 'blah.m.a.com'
-self.assertEqual(get_lossy_domain(url), 'blah.a.com')
-
url = 'a.co.uk'
self.assertEqual(get_lossy_domain(url), 'a.{}')

Expand All @@ -61,4 +58,4 @@ def test_get_lossy_domain(self):


if __name__ == '__main__':
-unittest.main()
+unittest.main()

0 comments on commit cf8a8e7

Please sign in to comment.