Skip to content

Commit ebf478b

Browse files
committed
Use the single-argument URI constructor in URIs.parseLeniently()
As the multi-argument constructor quotes '%', we were incorrectly double-encoding existing escapes. This change also corrects the handling of the empty path case. Fixes #90. Fixes #91.
1 parent 830a6e7 commit ebf478b

File tree

3 files changed

+143
-17
lines changed

3 files changed

+143
-17
lines changed

src/org/netpreserve/jwarc/URIs.java

+108-15
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,45 @@
66
import java.nio.CharBuffer;
77
import java.nio.charset.CharsetDecoder;
88
import java.nio.charset.CoderResult;
9-
import java.nio.charset.StandardCharsets;
109
import java.util.*;
1110
import java.util.regex.Matcher;
1211
import java.util.regex.Pattern;
1312

13+
import static java.nio.charset.StandardCharsets.UTF_8;
1414
import static java.util.regex.Pattern.DOTALL;
1515

1616
public class URIs {
1717
private final static Pattern URL_REGEX = Pattern.compile("\\A" +
1818
"(?:([a-zA-Z][^:]*):)?" + // scheme
19-
"[/\\\\\\r\\n\\t]*" + // slashes
20-
"([^/\\\\]*)" + // authority
19+
"([/\\\\\\r\\n\\t]*)" + // slashes
20+
"([^/\\\\?#]*)" + // authority
2121
"([/\\\\][^?#]*)?" + // path
2222
"(?:[?]([^#]*))?" + // query
2323
"(?:[#](.*))?" + // fragment
2424
"\\Z", DOTALL);
25+
private static final int SCHEME = 1, SLASHES = 2, AUTHORITY = 3, PATH = 4, QUERY = 5, FRAGMENT = 6;
2526
private final static Pattern AUTHORITY_REGEX = Pattern.compile("([^@]*@)?(.*?)(?::([0-9]+))?", DOTALL);
2627
private final static Pattern IPV4_REGEX = Pattern.compile("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}");
2728

29+
// According to https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/net/URI.html#uri-syntax-and-components-heading
30+
private static final String ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
31+
private static final String DIGIT = "0123456789";
32+
private static final String ALPHANUM = ALPHA + DIGIT;
33+
private static final String UNRESERVED = ALPHANUM + "_-!.~'()*";
34+
private static final String PUNCT = ",;:$&+=";
35+
private static final String RESERVED = PUNCT + "?/[]@";
36+
37+
private static final BitSet PATH_ALLOWED = charBitSet("/@" + UNRESERVED + PUNCT);
38+
private static final BitSet QUERY_ALLOWED = charBitSet(UNRESERVED + RESERVED);
39+
40+
private static BitSet charBitSet(String chars) {
41+
BitSet bitSet = new BitSet(128);
42+
for (char c : chars.toCharArray()) {
43+
bitSet.set(c);
44+
}
45+
return bitSet;
46+
}
47+
2848
/**
2949
* Returns true if the given string begins with a http: or https: URI scheme. Does not enforce the string is a
3050
* valid URI.
@@ -37,27 +57,100 @@ private static boolean startsWithIgnoreCase(String string, String prefix) {
3757
return string.regionMatches(true, 0, prefix, 0, prefix.length());
3858
}
3959

60+
/**
61+
* Like URI.create() but attempts to percent encode when possible instead of throwing.
62+
* Note that parseLeniently(s).toString().equals(s) may be false if percent encoding has occurred.
63+
* @throws IllegalArgumentException if parsing failed
64+
*/
4065
public static URI parseLeniently(String uri) {
41-
Matcher m = URL_REGEX.matcher(uri);
42-
if (!m.matches()) {
43-
throw new IllegalArgumentException();
44-
}
4566
try {
46-
return new URI(m.group(1), m.group(2), m.group(3), m.group(4), m.group(5));
67+
return new URI(uri);
4768
} catch (URISyntaxException e) {
48-
throw new IllegalArgumentException(e);
69+
Matcher urlMatcher = URL_REGEX.matcher(uri);
70+
if (!urlMatcher.matches()) {
71+
throw new IllegalArgumentException("invalid URI: " + uri);
72+
}
73+
74+
StringBuilder builder = new StringBuilder();
75+
76+
String scheme = urlMatcher.group(SCHEME);
77+
if (scheme != null) {
78+
builder.append(scheme);
79+
builder.append(':');
80+
}
81+
82+
String slashes = urlMatcher.group(SLASHES);
83+
if (slashes != null) builder.append(slashes);
84+
85+
String authority = urlMatcher.group(AUTHORITY);
86+
if (authority != null) {
87+
builder.append(authority);
88+
}
89+
90+
String path = urlMatcher.group(PATH);
91+
if (path != null) {
92+
builder.append(percentEncodeIfNeeded(path, PATH_ALLOWED));
93+
}
94+
95+
String query = urlMatcher.group(QUERY);
96+
if (query != null) {
97+
builder.append('?');
98+
builder.append(percentEncodeIfNeeded(query, QUERY_ALLOWED));
99+
}
100+
101+
String fragment = urlMatcher.group(FRAGMENT);
102+
if (fragment != null) {
103+
builder.append('#');
104+
builder.append(percentEncodeIfNeeded(fragment, QUERY_ALLOWED));
105+
}
106+
107+
return URI.create(builder.toString());
108+
}
109+
}
110+
111+
private static boolean isHexDigit(char c) {
112+
return (c >= '0' && c <= '9')
113+
|| (c >= 'a' && c <= 'f')
114+
|| (c >= 'A' && c <= 'F');
115+
}
116+
117+
private static boolean isASCII(char c) {
118+
return c <= 127;
119+
}
120+
121+
/**
122+
* Percent encodes a string per the given set of allowed characters. Valid existing percent escapes are
123+
* preserved instead of double escaped. Unicode characters which are not ASCII, control or space characters
124+
* are not encoded.
125+
*/
126+
private static String percentEncodeIfNeeded(String s, BitSet allowed) {
127+
StringBuilder out = new StringBuilder();
128+
for (int i = 0; i < s.length(); i++) {
129+
char c = s.charAt(i);
130+
if (allowed.get(c)) {
131+
out.append(c);
132+
} else if (c == '%' && i < s.length() - 2 && isHexDigit(s.charAt(i + 1)) && isHexDigit(s.charAt(i + 2))) {
133+
out.append(c); // valid existing escape
134+
} else if (!isASCII(c) && !Character.isISOControl(c) && !Character.isSpaceChar(c)) {
135+
out.append(c); // an 'other' unicode character
136+
} else {
137+
for (byte b : Character.toString(c).getBytes(UTF_8)) {
138+
out.append('%').append(String.format("%02x", (int) b));
139+
}
140+
}
49141
}
142+
return out.toString();
50143
}
51144

52145
public static String toNormalizedSurt(String uri) {
53146
Matcher urlMatcher = URL_REGEX.matcher(uri);
54147
if (!urlMatcher.matches()) {
55148
throw new IllegalArgumentException("invalid URL: " + uri);
56149
}
57-
String authority = urlMatcher.group(2);
58-
String path = urlMatcher.group(3);
59-
String query = urlMatcher.group(4);
60-
String fragment = urlMatcher.group(5);
150+
String authority = urlMatcher.group(AUTHORITY);
151+
String path = urlMatcher.group(PATH);
152+
String query = urlMatcher.group(QUERY);
153+
String fragment = urlMatcher.group(FRAGMENT);
61154

62155
Matcher authorityMatcher = AUTHORITY_REGEX.matcher(authority);
63156
if (!authorityMatcher.matches()) throw new IllegalStateException("authority didn't match");
@@ -133,7 +226,7 @@ private static String fullyPercentDecode(String s) {
133226

134227
public static String percentEncodeIllegals(String s) {
135228
StringBuilder out = new StringBuilder();
136-
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
229+
byte[] bytes = s.getBytes(UTF_8);
137230
for (byte rawByte : bytes) {
138231
int b = rawByte & 0xff;
139232
if (b == '%' || b == '#' || b <= 0x20 || b >= 0x7f) {
@@ -178,7 +271,7 @@ private static String percentDecode(String s) {
178271
}
179272

180273
private static void tryDecodeUtf8(ByteBuffer bb, StringBuilder out) {
181-
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
274+
CharsetDecoder decoder = UTF_8.newDecoder();
182275
CharBuffer cb = CharBuffer.allocate(bb.remaining());
183276
while (bb.hasRemaining()) {
184277
CoderResult result = decoder.decode(bb, cb, true);

src/org/netpreserve/jwarc/WarcTargetRecord.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,9 @@ public String target() {
4444
/**
4545
* The URI of the original target resource this record holds information about.
4646
* <p>
47-
* The {@link #target()} method should be preferred unless you actually need an instance of URI as some WARC files
48-
* may contain a value of WARC-Target-URI that cannot be represented as a Java URI instance without changing them.
47+
* This method uses URIs.parseLeniently() to percent encode characters that are rejected by the URI class and so may
48+
* return a value that is not identical to the value of the WARC-Target-URI field. Using {@link #target()} should
49+
* be preferred unless you actually need an instance of the URI class.
4950
*/
5051
public URI targetURI() {
5152
return URIs.parseLeniently(target());

test/org/netpreserve/jwarc/URIsTest.java

+32
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,36 @@ public class URIsTest {
99
public void toNormalizedSurt() {
1010
assertEquals("org,example:8080)/foo?&&a&b&c", URIs.toNormalizedSurt("http://wWw.EXAMPLE.org:8080/FOO?c&A&&&b"));
1111
}
12+
13+
@Test
14+
public void testParseLeniently() {
15+
roundtripParseLeniently("");
16+
roundtripParseLeniently("https://www.example.com#anchor");
17+
roundtripParseLeniently("https://example.com?a=b&cd[]=4");
18+
roundtripParseLeniently("/path/to/resource");
19+
roundtripParseLeniently("http://[2001:db8::1]/resource");
20+
roundtripParseLeniently("https://example.com/path%20with%20spaces");
21+
roundtripParseLeniently("https://example.com#fragment%20with%20spaces");
22+
roundtripParseLeniently("https://example.com?query%20with%20spaces");
23+
roundtripParseLeniently("https://example.com/路径");
24+
roundtripParseLeniently("https://example.com?query=测试");
25+
roundtripParseLeniently("https://////example.com?query=测试");
26+
roundtripParseLeniently("https://www.prijmeni.cz/Kr%C3%A1kora");
27+
roundtripParseLeniently("https://dx.doi.org/10.1038%2F35008096");
28+
29+
assertEquals("https://example.com/path%20with%20spaces", URIs.parseLeniently("https://example.com/path with spaces").toString());
30+
assertEquals("https://example.com?query%20with%20spaces", URIs.parseLeniently("https://example.com?query with spaces").toString());
31+
assertEquals("https://example.com#fragment%20with%20spaces", URIs.parseLeniently("https://example.com#fragment with spaces").toString());
32+
assertEquals("https://example.com/a%20b%25", URIs.parseLeniently("https://example.com/a b%25").toString());
33+
assertEquals("https://example.com/a%20b路径", URIs.parseLeniently("https://example.com/a b路径").toString());
34+
assertEquals("https://example.com?a%20b%25", URIs.parseLeniently("https://example.com?a b%25").toString());
35+
assertEquals("https://example.com?a%20b路径", URIs.parseLeniently("https://example.com?a b路径").toString());
36+
assertEquals("https://example.com#a%20b%25", URIs.parseLeniently("https://example.com#a b%25").toString());
37+
assertEquals("https://example.com/a%20b%25路径%5b?a%20b%25路径[?#a%20b%25路径[?", URIs.parseLeniently("https://example.com/a b%25路径[?a b%25路径[?#a b%25路径[?").toString());
38+
assertEquals("https://example.com/a%20b?c%20d#e%20f", URIs.parseLeniently("https://example.com/a b?c d#e f").toString());
39+
}
40+
41+
private void roundtripParseLeniently(String s) {
42+
assertEquals(s, URIs.parseLeniently(s).toString());
43+
}
1244
}

0 commit comments

Comments
 (0)