Skip to content

Commit

Permalink
fix listing and viewing filenames with non-utf8 chars
Browse files Browse the repository at this point in the history
Any folder containing at least one file whose filename has characters
that can't be decoded as utf-8 can't be listed; additionally, these
files can't be viewed.

Examples:
- https://sources.debian.org/src/debian-maintainers/1.52/debian-maintainers/
- https://sources.debian.org/src/cvsnt/2.5.03.2382-3.3+lenny1/
- https://sources.debian.org/src/aspell-is/0.51-0-4/

Fix this, and use the latest testdata/ commit to ensure it doesn't
break again (it adds package aspell-is which contains such a file, see
last example above).
  • Loading branch information
matthieucan committed May 1, 2021
1 parent a18c72f commit 0445961
Show file tree
Hide file tree
Showing 14 changed files with 91 additions and 45 deletions.
1 change: 1 addition & 0 deletions contrib/docker/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ local_dir: %(root_dir)s/local
sources_dir: %(root_dir)s/testdata/sources
python_dir: %(root_dir)s/python
mirror_dir: %(root_dir)s/testdata/mirror
mirror_archive_dir: %(root_dir)s/testdata/archive
pool_dir: %(mirror_dir)s/pool
dry_run: false
# echoes or not the SQL requests to stdout (can be logged with Apache):
Expand Down
7 changes: 5 additions & 2 deletions lib/debsources/app/sources/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from debsources.navigation import (Location, Directory,
SourceFile)

from debsources.url import url_decode, url_encode
import debsources.query as qry
from ..views import GeneralView, app, session
from ..extract_stats import extract_stats
Expand Down Expand Up @@ -169,7 +170,7 @@ def _render_file(self, location):
package=location.get_package(),
version=location.get_version(),
mime=file_.get_mime(),
raw_url=str(raw_url),
raw_url=raw_url,
path=str(path),
text_file=text_file,
stat=qry.location_get_stat(location.sources_path),
Expand Down Expand Up @@ -216,7 +217,7 @@ def _render_file(self, location):
code=sourcefile)

return dict(type="file",
file=location.get_deepest_element(),
file=url_encode(location.get_deepest_element()),
package=location.get_package(),
version=location.get_version(),
mime=file_.get_mime(),
Expand All @@ -237,6 +238,8 @@ def get_objects(self, path_to):
Directory: we want the subdirs and subfiles (disk listing)
File: we want to render the raw url of the file
"""
path_to = url_decode(path_to)

package, version, *path = path_to.split('/')
path = Path('/'.join(path))

Expand Down
21 changes: 11 additions & 10 deletions lib/debsources/navigation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from debsources.models import (Checksum, File,
Package, PackageName)
from debsources import filetype
from debsources.url import url_encode
from debsources.consts import AREAS
from debsources.debmirror import SourcePackage
from debsources.excepts import FileOrFolderNotFound, \
Expand Down Expand Up @@ -138,15 +139,15 @@ def get_type(f):
else:
return "file"

listing = sorted(
(dict(
name=f.name,
type=get_type(f),
hidden=False,
stat=qry.location_get_stat(self.sources_path / f)
)
for f in Path.iterdir(self.sources_path)),
key=lambda x: x['name'])
listing = [
{
'name': url_encode(f.name),
'type': get_type(f),
'hidden': False,
'stat': qry.location_get_stat(self.sources_path / f)
}
for f in sorted(Path.iterdir(self.sources_path))
]

for hidden_file in self.hidden_files:
for f in listing:
Expand Down Expand Up @@ -214,4 +215,4 @@ def istextfile(self):

def get_raw_url(self):
""" return the raw url on disk (e.g. data/main/a/azerty/foo.bar) """
return self.sources_path_static
return url_encode(str(self.sources_path_static))
3 changes: 2 additions & 1 deletion lib/debsources/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from collections import namedtuple

from debian.debian_support import version_compare
from debsources.url import url_encode
from debsources.consts import PREFIXES_DEFAULT
from debsources.consts import SUITES
from debsources.excepts import InvalidPackageOrVersionError
Expand Down Expand Up @@ -147,7 +148,7 @@ def location_get_path_links(endpoint, path_to: Path):
returns the path hierarchy with urls, to use with 'You are here:'
[(name, url(name)), (...), ...]
"""
path_dict = path_to.parts
path_dict = [url_encode(x) for x in path_to.parts]
pathl = []

# we import flask here, in order to permit the use of this module
Expand Down
4 changes: 2 additions & 2 deletions lib/debsources/tests/test_fs_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ class FsStorageTests(unittest.TestCase):
@istest
def assertWalkLength(self):
self.assertEqual(len([f for f in walk(make_path(''))]),
261)
268)

@istest
def assertWalkTestChecksums(self):
self.assertEqual(
len([f for f in walk(make_path(''),
test=lambda x: 'checksums' in str(x))]),
36)
37)

@istest
def parsePathDir(self):
Expand Down
4 changes: 2 additions & 2 deletions lib/debsources/tests/test_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def tearDownClass(cls):
def test_packages_prefixes(self):
self.assertEqual(qry.pkg_names_get_packages_prefixes(
self.app_wrapper.app.config["CACHE_DIR"]),
['b', 'c', 'd', 'f', 'g', 'l', 'libc', 'm',
['a', 'b', 'c', 'd', 'f', 'g', 'l', 'libc', 'm',
'n', 'o', 'p', 's', 'u'])

def test_list_versions(self):
Expand Down Expand Up @@ -104,5 +104,5 @@ def test_ratio(self):
# overall
self.assertEqual(qry.get_ratio(self.session), 77)
# per suite
self.assertEqual(qry.get_ratio(self.session, 'jessie'), 50)
self.assertEqual(qry.get_ratio(self.session, 'jessie'), 51)
self.assertEqual(qry.get_ratio(self.session, 'squeeze'), 100)
30 changes: 15 additions & 15 deletions lib/debsources/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ def diskUsagesMatchReferenceDb(self):
sizes = {
'squeeze': 44316,
'wheezy': 39688,
'jessie': 50528,
'jessie': 51428,
'sid': 54456,
'experimental': 12964,
}
total_size = 180728
total_size = 181628
self.assertSuiteCountsEqual(sizes, statistics.disk_usage)
self.assertEqual(total_size, statistics.disk_usage(self.session))

Expand All @@ -60,11 +60,11 @@ def sourcePackagesCountsMatchReferenceDb(self):
source_packages = {
'squeeze': 13,
'wheezy': 12,
'jessie': 13,
'jessie': 14,
'sid': 14,
'experimental': 2,
}
total_source_packages = 36
total_source_packages = 37
self.assertSuiteCountsEqual(source_packages,
statistics.source_packages)
self.assertEqual(total_source_packages,
Expand All @@ -75,11 +75,11 @@ def sourceFilesCountsMatchReferenceDb(self):
source_files = {
'squeeze': 2024,
'wheezy': 1632,
'jessie': 2038,
'jessie': 2059,
'sid': 2613,
'experimental': 1396,
}
total_files = 9333
total_files = 9354
self.assertSuiteCountsEqual(source_files, statistics.source_files)
self.assertEqual(total_files, statistics.source_files(self.session))

Expand All @@ -94,14 +94,14 @@ def slocCountsMatchReferenceDb(self):
'java': 916,
'lex': 223,
'lisp': 2193,
'makefile': 2092,
'makefile': 2104,
'ml': 5044,
'objc': 836,
'perl': 1199,
'python': 2916,
'ruby': 193,
'sed': 16,
'sh': 29984,
'sh': 30045,
'sql': 237,
'xml': 14932,
'yacc': 312,
Expand All @@ -122,11 +122,11 @@ def ctagsCountsMatchReferenceDb(self):
ctags = {
'squeeze': 31015,
'wheezy': 20521,
'jessie': 23815,
'jessie': 23816,
'sid': 28723,
'experimental': 17284,
}
total_ctags = 116832
total_ctags = 116833
self.assertSuiteCountsEqual(ctags, statistics.ctags)
self.assertEqual(total_ctags, statistics.ctags(self.session))

Expand All @@ -149,15 +149,15 @@ def slocPerPkgMatchReferenceDb(self):
@istest
def areaFiltersMatchReferenceDb(self):
self.assertEqual(statistics.disk_usage(self.session),
180728)
181628)
self.assertEqual(statistics.disk_usage(self.session, areas=['main']),
155172)
156072)
self.assertEqual(statistics.disk_usage(self.session,
suite='wheezy', areas=['main']),
35824)

area_count = statistics.source_packages(self.session, areas=['main'])
self.assertEqual(area_count, 17)
self.assertEqual(area_count, 18)
self.assertLessEqual(area_count,
statistics.source_packages(self.session))

Expand All @@ -172,7 +172,7 @@ def areaFiltersMatchReferenceDb(self):
statistics.sloccount_lang(self.session, 'ansic'))

area_count = statistics.ctags(self.session, areas=['main'])
self.assertEqual(area_count, 88250)
self.assertEqual(area_count, 88251)
self.assertLessEqual(area_count, statistics.ctags(self.session))

@istest
Expand All @@ -198,7 +198,7 @@ def test_group_by_stats(self):

stats = dict(statistics.stats_grouped_by(self.session,
'source_packages'))
self.assertEqual(stats['jessie'], 13)
self.assertEqual(stats['jessie'], 14)

stats = dict(statistics.stats_grouped_by(self.session, 'source_files'))
self.assertEqual(stats['wheezy'], 1632)
Expand Down
12 changes: 6 additions & 6 deletions lib/debsources/tests/test_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,20 +280,20 @@ def tearDown(self):

@istest
def sizeMatchesReferenceDb(self):
EXPECTED_SIZE = 180728
EXPECTED_SIZE = 181628
self.assertEqual(EXPECTED_SIZE, self.stats['total.disk_usage'])

@istest
def statsMatchReferenceDb(self):
expected_stats = { # just a few samples
'total.ctags': 116832,
'total.ctags': 116833,
'debian_sid.ctags': 28723,
'debian_squeeze.ctags': 31015,
'debian_experimental.disk_usage': 12964,
'total.source_files': 9333,
'total.source_files': 9354,
'debian_experimental.source_files': 1396,
'debian_jessie.source_files': 2038,
'total.source_packages': 36,
'debian_jessie.source_files': 2059,
'total.source_packages': 37,
'debian_squeeze.source_packages': 13,
'debian_wheezy.source_packages': 12,
'debian_sid.sloccount.awk': 25,
Expand All @@ -305,7 +305,7 @@ def statsMatchReferenceDb(self):
'debian_wheezy.sloccount.python': 2798,
'debian_squeeze.sloccount.ruby': 193,
'debian_wheezy.sloccount.ruby': 193,
'total.sloccount': 1250830,
'total.sloccount': 1250903,
'total.sloccount.javascript': 212,
'debian_squeeze.sloccount': 315750,
}
Expand Down
9 changes: 9 additions & 0 deletions lib/debsources/tests/test_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from debsources.url import url_decode, url_encode


def test_url_encode():
    """A surrogate-escaped byte is emitted as its percent-encoded raw value."""
    encoded = url_encode("hello\udced")
    expected = "hello%ED"
    assert encoded == expected


def test_url_decode():
    """A percent-encoded non-UTF-8 byte comes back as a lone surrogate."""
    decoded = url_decode("hello%ED")
    expected = "hello\udced"
    assert decoded == expected
2 changes: 1 addition & 1 deletion lib/debsources/tests/test_web_cp.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_api_packages_list(self):
rv = json.loads(
self.app.get('/copyright/api/list/').data)
self.assertIn({'name': "ocaml-curses"}, rv['packages'])
self.assertEqual(len(rv['packages']), 18)
self.assertEqual(len(rv['packages']), 19)

def test_api_by_prefix(self):
rv = json.loads(
Expand Down
2 changes: 1 addition & 1 deletion lib/debsources/tests/test_web_patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_api_packages_list(self):
rv = json.loads(
self.app.get('/patches/api/list/').data)
self.assertIn({'name': "ocaml-curses"}, rv['packages'])
self.assertEqual(len(rv['packages']), 18)
self.assertEqual(len(rv['packages']), 19)

def test_api_by_prefix(self):
rv = json.loads(
Expand Down
22 changes: 18 additions & 4 deletions lib/debsources/tests/test_webapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def test_static_pages(self):
def test_api_packages_list(self):
rv = json.loads(self.app.get('/api/list/').data)
self.assertIn({'name': "libcaca"}, rv['packages'])
self.assertEqual(len(rv['packages']), 18)
self.assertEqual(len(rv['packages']), 19)

def test_api_by_prefix(self):
rv = json.loads(self.app.get('/api/prefix/libc/').data)
Expand Down Expand Up @@ -554,9 +554,9 @@ def test_info_version(self):
def test_api_stats_suite(self):
rv = json.loads(self.app.get('/api/stats/jessie/').data)
self.assertEqual(rv["suite"], "jessie")
self.assertEqual(rv["results"]["debian_jessie.ctags"], 23815)
self.assertEqual(rv["results"]["debian_jessie.disk_usage"], 50528)
self.assertEqual(rv["results"]["debian_jessie.source_files"], 2038)
self.assertEqual(rv["results"]["debian_jessie.ctags"], 23816)
self.assertEqual(rv["results"]["debian_jessie.disk_usage"], 51428)
self.assertEqual(rv["results"]["debian_jessie.source_files"], 2059)
self.assertEqual(rv["results"]["debian_jessie.sloccount.python"], 2916)

def test_api_released_suite(self):
Expand Down Expand Up @@ -621,5 +621,19 @@ def test_news(self):
rv = self.app.get(news_routes[news_file])
self.assertIn(news_string, rv.data.decode())

def test_non_utf8_filename(self):
    """Check a folder with a non-utf8 filename can be listed and browsed.

    Requests the aspell-is 0.51-0-4 listing, checks the link to the
    non-utf8 file is rendered percent-encoded, then follows that link and
    checks the file page is served.
    """
    # List folder containing a non-utf8 filename.
    rv = self.app.get('/src/aspell-is/0.51-0-4/')
    self.assertEqual(200, rv.status_code)
    self.assertIn(
        b'<a href="/src/aspell-is/0.51-0-4/%25EDslenska.alias/">%EDslenska.alias</a>',
        rv.data
    )
    # Visit that file.
    rv = self.app.get('/src/aspell-is/0.51-0-4/%25EDslenska.alias/')
    self.assertEqual(200, rv.status_code)
    # rv.data is bytes (the listing check above uses a bytes needle), so
    # the needle must be bytes too: `str in bytes` raises TypeError
    # instead of performing the containment check.
    self.assertIn(b"<h2>File: %EDslenska.alias</h2>", rv.data)


if __name__ == '__main__':
unittest.main(exit=False)
17 changes: 17 additions & 0 deletions lib/debsources/url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import urllib.parse


def url_encode(name: str) -> str:
    r"""Percent-encode a surrogate-escaped string for use in URIs.

    The string is first encoded to UTF-8 bytes with the ``surrogateescape``
    error handler, which turns each lone surrogate (U+DC80..U+DCFF) back
    into the single raw byte it stands for (e.g. a byte of a filename that
    is not valid UTF-8). The resulting bytes are then percent-quoted.

    ``/`` is left unquoted (the default ``safe`` set of
    ``urllib.parse.quote``), so path separators are preserved.

    E.g. hello\udced -> hello%ED

    :param name: text, possibly containing surrogate-escaped bytes
    :return: the percent-encoded, pure-ASCII form of ``name``
    """
    # NOTE: the docstring is a raw string on purpose: a plain literal would
    # turn the example escape into a real lone surrogate, making __doc__
    # impossible to encode as UTF-8.
    raw = name.encode('utf-8', 'surrogateescape')
    return urllib.parse.quote(raw)


def url_decode(url: str) -> str:
    r"""Percent-decode a URI with byte characters into a surrogate-escaped
    string.

    Percent-escapes are decoded as UTF-8 with the ``surrogateescape`` error
    handler: bytes that do not form valid UTF-8 are not replaced or
    rejected, but mapped to lone surrogates (U+DC80..U+DCFF) so the
    original bytes can be recovered later by encoding with the same error
    handler.

    E.g. hello%ED -> hello\udced

    :param url: percent-encoded text
    :return: the decoded text, with undecodable bytes as lone surrogates
    """
    # NOTE: the docstring is a raw string on purpose: a plain literal would
    # turn the example escape into a real lone surrogate, making __doc__
    # impossible to encode as UTF-8.
    return urllib.parse.unquote(
        url, encoding='utf-8', errors='surrogateescape')
2 changes: 1 addition & 1 deletion testdata
Submodule testdata updated from 370ace to bd9a96

0 comments on commit 0445961

Please sign in to comment.