From 0445961cdbc7535c4f1d1b4c87a2ccbebe51f123 Mon Sep 17 00:00:00 2001 From: Matthieu Caneill Date: Sun, 2 May 2021 00:39:57 +0200 Subject: [PATCH] fix listing and viewing filenames with non-utf8 chars Any folder containing at least one file whose filename has characters that can't be decoded to utf-8 can't be listed; additionally these files can't be viewed. Examples: - https://sources.debian.org/src/debian-maintainers/1.52/debian-maintainers/ - https://sources.debian.org/src/cvsnt/2.5.03.2382-3.3+lenny1/ - https://sources.debian.org/src/aspell-is/0.51-0-4/ Fix this, and use the latest testdata/ commit to ensure it doesn't break again (it adds package aspell-is which contains such a file, see last example above). --- contrib/docker/config.ini | 1 + lib/debsources/app/sources/views.py | 7 ++++-- lib/debsources/navigation.py | 21 +++++++++-------- lib/debsources/query.py | 3 ++- lib/debsources/tests/test_fs_storage.py | 4 ++-- lib/debsources/tests/test_queries.py | 4 ++-- lib/debsources/tests/test_stats.py | 30 ++++++++++++------------ lib/debsources/tests/test_updater.py | 12 +++++----- lib/debsources/tests/test_url.py | 9 +++++++ lib/debsources/tests/test_web_cp.py | 2 +- lib/debsources/tests/test_web_patches.py | 2 +- lib/debsources/tests/test_webapp.py | 22 +++++++++++++---- lib/debsources/url.py | 17 ++++++++++++++ testdata | 2 +- 14 files changed, 91 insertions(+), 45 deletions(-) create mode 100644 lib/debsources/tests/test_url.py create mode 100644 lib/debsources/url.py diff --git a/contrib/docker/config.ini b/contrib/docker/config.ini index f26f2bf3..73166ab4 100644 --- a/contrib/docker/config.ini +++ b/contrib/docker/config.ini @@ -12,6 +12,7 @@ local_dir: %(root_dir)s/local sources_dir: %(root_dir)s/testdata/sources python_dir: %(root_dir)s/python mirror_dir: %(root_dir)s/testdata/mirror +mirror_archive_dir: %(root_dir)s/testdata/archive pool_dir: %(mirror_dir)s/pool dry_run: false # echoes or not the SQL requests to stdout (can be logged with Apache): diff 
--git a/lib/debsources/app/sources/views.py b/lib/debsources/app/sources/views.py index 2db3d615..5e7a052f 100644 --- a/lib/debsources/app/sources/views.py +++ b/lib/debsources/app/sources/views.py @@ -25,6 +25,7 @@ from debsources.navigation import (Location, Directory, SourceFile) +from debsources.url import url_decode, url_encode import debsources.query as qry from ..views import GeneralView, app, session from ..extract_stats import extract_stats @@ -169,7 +170,7 @@ def _render_file(self, location): package=location.get_package(), version=location.get_version(), mime=file_.get_mime(), - raw_url=str(raw_url), + raw_url=raw_url, path=str(path), text_file=text_file, stat=qry.location_get_stat(location.sources_path), @@ -216,7 +217,7 @@ def _render_file(self, location): code=sourcefile) return dict(type="file", - file=location.get_deepest_element(), + file=url_encode(location.get_deepest_element()), package=location.get_package(), version=location.get_version(), mime=file_.get_mime(), @@ -237,6 +238,8 @@ def get_objects(self, path_to): Directory: we want the subdirs and subfiles (disk listing) File: we want to render the raw url of the file """ + path_to = url_decode(path_to) + package, version, *path = path_to.split('/') path = Path('/'.join(path)) diff --git a/lib/debsources/navigation.py b/lib/debsources/navigation.py index 899a3970..40d56c26 100644 --- a/lib/debsources/navigation.py +++ b/lib/debsources/navigation.py @@ -20,6 +20,7 @@ from debsources.models import (Checksum, File, Package, PackageName) from debsources import filetype +from debsources.url import url_encode from debsources.consts import AREAS from debsources.debmirror import SourcePackage from debsources.excepts import FileOrFolderNotFound, \ @@ -138,15 +139,15 @@ def get_type(f): else: return "file" - listing = sorted( - (dict( - name=f.name, - type=get_type(f), - hidden=False, - stat=qry.location_get_stat(self.sources_path / f) - ) - for f in Path.iterdir(self.sources_path)), - key=lambda x: 
x['name']) + listing = [ + { + 'name': url_encode(f.name), + 'type': get_type(f), + 'hidden': False, + 'stat': qry.location_get_stat(self.sources_path / f) + } + for f in sorted(Path.iterdir(self.sources_path)) + ] for hidden_file in self.hidden_files: for f in listing: @@ -214,4 +215,4 @@ def istextfile(self): def get_raw_url(self): """ return the raw url on disk (e.g. data/main/a/azerty/foo.bar) """ - return self.sources_path_static + return url_encode(str(self.sources_path_static)) diff --git a/lib/debsources/query.py b/lib/debsources/query.py index 5ceb9bcc..342f8321 100644 --- a/lib/debsources/query.py +++ b/lib/debsources/query.py @@ -20,6 +20,7 @@ from collections import namedtuple from debian.debian_support import version_compare +from debsources.url import url_encode from debsources.consts import PREFIXES_DEFAULT from debsources.consts import SUITES from debsources.excepts import InvalidPackageOrVersionError @@ -147,7 +148,7 @@ def location_get_path_links(endpoint, path_to: Path): returns the path hierarchy with urls, to use with 'You are here:' [(name, url(name)), (...), ...] 
""" - path_dict = path_to.parts + path_dict = [url_encode(x) for x in path_to.parts] pathl = [] # we import flask here, in order to permit the use of this module diff --git a/lib/debsources/tests/test_fs_storage.py b/lib/debsources/tests/test_fs_storage.py index 8869a953..d3112255 100644 --- a/lib/debsources/tests/test_fs_storage.py +++ b/lib/debsources/tests/test_fs_storage.py @@ -31,14 +31,14 @@ class FsStorageTests(unittest.TestCase): @istest def assertWalkLength(self): self.assertEqual(len([f for f in walk(make_path(''))]), - 261) + 268) @istest def assertWalkTestChecksums(self): self.assertEqual( len([f for f in walk(make_path(''), test=lambda x: 'checksums' in str(x))]), - 36) + 37) @istest def parsePathDir(self): diff --git a/lib/debsources/tests/test_queries.py b/lib/debsources/tests/test_queries.py index f32d5350..48fed2d7 100644 --- a/lib/debsources/tests/test_queries.py +++ b/lib/debsources/tests/test_queries.py @@ -50,7 +50,7 @@ def tearDownClass(cls): def test_packages_prefixes(self): self.assertEqual(qry.pkg_names_get_packages_prefixes( self.app_wrapper.app.config["CACHE_DIR"]), - ['b', 'c', 'd', 'f', 'g', 'l', 'libc', 'm', + ['a', 'b', 'c', 'd', 'f', 'g', 'l', 'libc', 'm', 'n', 'o', 'p', 's', 'u']) def test_list_versions(self): @@ -104,5 +104,5 @@ def test_ratio(self): # overall self.assertEqual(qry.get_ratio(self.session), 77) # per suite - self.assertEqual(qry.get_ratio(self.session, 'jessie'), 50) + self.assertEqual(qry.get_ratio(self.session, 'jessie'), 51) self.assertEqual(qry.get_ratio(self.session, 'squeeze'), 100) diff --git a/lib/debsources/tests/test_stats.py b/lib/debsources/tests/test_stats.py index 63ffb361..1478ccd8 100644 --- a/lib/debsources/tests/test_stats.py +++ b/lib/debsources/tests/test_stats.py @@ -47,11 +47,11 @@ def diskUsagesMatchReferenceDb(self): sizes = { 'squeeze': 44316, 'wheezy': 39688, - 'jessie': 50528, + 'jessie': 51428, 'sid': 54456, 'experimental': 12964, } - total_size = 180728 + total_size = 181628 
self.assertSuiteCountsEqual(sizes, statistics.disk_usage) self.assertEqual(total_size, statistics.disk_usage(self.session)) @@ -60,11 +60,11 @@ def sourcePackagesCountsMatchReferenceDb(self): source_packages = { 'squeeze': 13, 'wheezy': 12, - 'jessie': 13, + 'jessie': 14, 'sid': 14, 'experimental': 2, } - total_source_packages = 36 + total_source_packages = 37 self.assertSuiteCountsEqual(source_packages, statistics.source_packages) self.assertEqual(total_source_packages, @@ -75,11 +75,11 @@ def sourceFilesCountsMatchReferenceDb(self): source_files = { 'squeeze': 2024, 'wheezy': 1632, - 'jessie': 2038, + 'jessie': 2059, 'sid': 2613, 'experimental': 1396, } - total_files = 9333 + total_files = 9354 self.assertSuiteCountsEqual(source_files, statistics.source_files) self.assertEqual(total_files, statistics.source_files(self.session)) @@ -94,14 +94,14 @@ def slocCountsMatchReferenceDb(self): 'java': 916, 'lex': 223, 'lisp': 2193, - 'makefile': 2092, + 'makefile': 2104, 'ml': 5044, 'objc': 836, 'perl': 1199, 'python': 2916, 'ruby': 193, 'sed': 16, - 'sh': 29984, + 'sh': 30045, 'sql': 237, 'xml': 14932, 'yacc': 312, @@ -122,11 +122,11 @@ def ctagsCountsMatchReferenceDb(self): ctags = { 'squeeze': 31015, 'wheezy': 20521, - 'jessie': 23815, + 'jessie': 23816, 'sid': 28723, 'experimental': 17284, } - total_ctags = 116832 + total_ctags = 116833 self.assertSuiteCountsEqual(ctags, statistics.ctags) self.assertEqual(total_ctags, statistics.ctags(self.session)) @@ -149,15 +149,15 @@ def slocPerPkgMatchReferenceDb(self): @istest def areaFiltersMatchReferenceDb(self): self.assertEqual(statistics.disk_usage(self.session), - 180728) + 181628) self.assertEqual(statistics.disk_usage(self.session, areas=['main']), - 155172) + 156072) self.assertEqual(statistics.disk_usage(self.session, suite='wheezy', areas=['main']), 35824) area_count = statistics.source_packages(self.session, areas=['main']) - self.assertEqual(area_count, 17) + self.assertEqual(area_count, 18) 
self.assertLessEqual(area_count, statistics.source_packages(self.session)) @@ -172,7 +172,7 @@ def areaFiltersMatchReferenceDb(self): statistics.sloccount_lang(self.session, 'ansic')) area_count = statistics.ctags(self.session, areas=['main']) - self.assertEqual(area_count, 88250) + self.assertEqual(area_count, 88251) self.assertLessEqual(area_count, statistics.ctags(self.session)) @istest @@ -198,7 +198,7 @@ def test_group_by_stats(self): stats = dict(statistics.stats_grouped_by(self.session, 'source_packages')) - self.assertEqual(stats['jessie'], 13) + self.assertEqual(stats['jessie'], 14) stats = dict(statistics.stats_grouped_by(self.session, 'source_files')) self.assertEqual(stats['wheezy'], 1632) diff --git a/lib/debsources/tests/test_updater.py b/lib/debsources/tests/test_updater.py index 2a8482c0..4ba3a8f2 100644 --- a/lib/debsources/tests/test_updater.py +++ b/lib/debsources/tests/test_updater.py @@ -280,20 +280,20 @@ def tearDown(self): @istest def sizeMatchesReferenceDb(self): - EXPECTED_SIZE = 180728 + EXPECTED_SIZE = 181628 self.assertEqual(EXPECTED_SIZE, self.stats['total.disk_usage']) @istest def statsMatchReferenceDb(self): expected_stats = { # just a few samples - 'total.ctags': 116832, + 'total.ctags': 116833, 'debian_sid.ctags': 28723, 'debian_squeeze.ctags': 31015, 'debian_experimental.disk_usage': 12964, - 'total.source_files': 9333, + 'total.source_files': 9354, 'debian_experimental.source_files': 1396, - 'debian_jessie.source_files': 2038, - 'total.source_packages': 36, + 'debian_jessie.source_files': 2059, + 'total.source_packages': 37, 'debian_squeeze.source_packages': 13, 'debian_wheezy.source_packages': 12, 'debian_sid.sloccount.awk': 25, @@ -305,7 +305,7 @@ def statsMatchReferenceDb(self): 'debian_wheezy.sloccount.python': 2798, 'debian_squeeze.sloccount.ruby': 193, 'debian_wheezy.sloccount.ruby': 193, - 'total.sloccount': 1250830, + 'total.sloccount': 1250903, 'total.sloccount.javascript': 212, 'debian_squeeze.sloccount': 315750, } diff 
--git a/lib/debsources/tests/test_url.py b/lib/debsources/tests/test_url.py new file mode 100644 index 00000000..76b0e4d3 --- /dev/null +++ b/lib/debsources/tests/test_url.py @@ -0,0 +1,9 @@ +from debsources.url import url_decode, url_encode + + +def test_url_encode(): + assert url_encode("hello\udced") == "hello%ED" + + +def test_url_decode(): + assert url_decode("hello%ED") == "hello\udced" diff --git a/lib/debsources/tests/test_web_cp.py b/lib/debsources/tests/test_web_cp.py index dcbe4bba..529aa1b6 100644 --- a/lib/debsources/tests/test_web_cp.py +++ b/lib/debsources/tests/test_web_cp.py @@ -31,7 +31,7 @@ def test_api_packages_list(self): rv = json.loads( self.app.get('/copyright/api/list/').data) self.assertIn({'name': "ocaml-curses"}, rv['packages']) - self.assertEqual(len(rv['packages']), 18) + self.assertEqual(len(rv['packages']), 19) def test_api_by_prefix(self): rv = json.loads( diff --git a/lib/debsources/tests/test_web_patches.py b/lib/debsources/tests/test_web_patches.py index e28e27d3..f9296cd5 100644 --- a/lib/debsources/tests/test_web_patches.py +++ b/lib/debsources/tests/test_web_patches.py @@ -31,7 +31,7 @@ def test_api_packages_list(self): rv = json.loads( self.app.get('/patches/api/list/').data) self.assertIn({'name': "ocaml-curses"}, rv['packages']) - self.assertEqual(len(rv['packages']), 18) + self.assertEqual(len(rv['packages']), 19) def test_api_by_prefix(self): rv = json.loads( diff --git a/lib/debsources/tests/test_webapp.py b/lib/debsources/tests/test_webapp.py index a04f860b..46af65e5 100644 --- a/lib/debsources/tests/test_webapp.py +++ b/lib/debsources/tests/test_webapp.py @@ -184,7 +184,7 @@ def test_static_pages(self): def test_api_packages_list(self): rv = json.loads(self.app.get('/api/list/').data) self.assertIn({'name': "libcaca"}, rv['packages']) - self.assertEqual(len(rv['packages']), 18) + self.assertEqual(len(rv['packages']), 19) def test_api_by_prefix(self): rv = json.loads(self.app.get('/api/prefix/libc/').data) @@ -554,9 
+554,9 @@ def test_info_version(self): def test_api_stats_suite(self): rv = json.loads(self.app.get('/api/stats/jessie/').data) self.assertEqual(rv["suite"], "jessie") - self.assertEqual(rv["results"]["debian_jessie.ctags"], 23815) - self.assertEqual(rv["results"]["debian_jessie.disk_usage"], 50528) - self.assertEqual(rv["results"]["debian_jessie.source_files"], 2038) + self.assertEqual(rv["results"]["debian_jessie.ctags"], 23816) + self.assertEqual(rv["results"]["debian_jessie.disk_usage"], 51428) + self.assertEqual(rv["results"]["debian_jessie.source_files"], 2059) self.assertEqual(rv["results"]["debian_jessie.sloccount.python"], 2916) def test_api_released_suite(self): @@ -621,5 +621,19 @@ def test_news(self): rv = self.app.get(news_routes[news_file]) self.assertIn(news_string, rv.data.decode()) + def test_non_utf8_filename(self): + # List folder containing a non-utf8 filename. + rv = self.app.get('/src/aspell-is/0.51-0-4/') + self.assertEqual(200, rv.status_code) + self.assertIn( + b'%EDslenska.alias', + rv.data + ) + # Visit that file. + rv = self.app.get('/src/aspell-is/0.51-0-4/%25EDslenska.alias/') + self.assertEqual(200, rv.status_code) + self.assertIn("

File: %EDslenska.alias

", rv.data) + + if __name__ == '__main__': unittest.main(exit=False) diff --git a/lib/debsources/url.py b/lib/debsources/url.py new file mode 100644 index 00000000..502040d6 --- /dev/null +++ b/lib/debsources/url.py @@ -0,0 +1,17 @@ +import urllib.parse + + +def url_encode(name: str) -> str: + """Percent-encode a surrogate-escaped string for use in URIs. + + E.g. hello\udced -> hello%ED + """ + return urllib.parse.quote(bytes(name, 'utf8', 'surrogateescape')) + + +def url_decode(url: str) -> str: + """Percent-decode an URI with byte characters into a surrogate-escaped string. + + E.g. hello%ED -> hello\udced + """ + return urllib.parse.unquote(url, 'utf8', 'surrogateescape') diff --git a/testdata b/testdata index 370aceec..bd9a96d4 160000 --- a/testdata +++ b/testdata @@ -1 +1 @@ -Subproject commit 370aceecfc43ce4106e0eac2ed6257983181d5c0 +Subproject commit bd9a96d48162b8d482c35f8780f6ebb1098c224c