6
6
import java .nio .CharBuffer ;
7
7
import java .nio .charset .CharsetDecoder ;
8
8
import java .nio .charset .CoderResult ;
9
- import java .nio .charset .StandardCharsets ;
10
9
import java .util .*;
11
10
import java .util .regex .Matcher ;
12
11
import java .util .regex .Pattern ;
13
12
13
+ import static java .nio .charset .StandardCharsets .UTF_8 ;
14
14
import static java .util .regex .Pattern .DOTALL ;
15
15
16
16
public class URIs {
17
17
// Splits a URI string into capture groups, in the order given by the trailing comments on each fragment of
// the pattern below. DOTALL is passed so that '.' (used by the fragment group) also matches line terminators.
private final static Pattern URL_REGEX = Pattern .compile ("\\ A" +
18
18
"(?:([a-zA-Z][^:]*):)?" + // scheme
19
- "[/\\ \\ \\ r\\ n\\ t]*" + // slashes
20
- "([^/\\ \\ ]*)" + // authority
19
+ "( [/\\ \\ \\ r\\ n\\ t]*) " + // slashes
20
+ "([^/\\ \\ ?# ]*)" + // authority
21
21
"([/\\ \\ ][^?#]*)?" + // path
22
22
"(?:[?]([^#]*))?" + // query
23
23
"(?:[#](.*))?" + // fragment
24
24
"\\ Z" , DOTALL );
25
// Capture group indices for URL_REGEX; they are passed to Matcher.group() by the parsing methods of this class.
+ private static final int SCHEME = 1 , SLASHES = 2 , AUTHORITY = 3 , PATH = 4 , QUERY = 5 , FRAGMENT = 6 ;
25
26
// Splits an authority into: group 1 = optional prefix ending in '@', group 2 = the remainder (matched
// reluctantly), group 3 = optional run of digits following a ':' (presumably the port).
private final static Pattern AUTHORITY_REGEX = Pattern .compile ("([^@]*@)?(.*?)(?::([0-9]+))?" , DOTALL );
26
27
// Four dot-separated runs of 1-3 decimal digits. The pattern itself does not range-check each run (e.g. <= 255).
private final static Pattern IPV4_REGEX = Pattern .compile ("[0-9]{1,3}\\ .[0-9]{1,3}\\ .[0-9]{1,3}\\ .[0-9]{1,3}" );
27
28
29
+ // Character classes used to decide which characters may appear unescaped, defined according to
// https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/net/URI.html#uri-syntax-and-components-heading
30
+ private static final String ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" ;
31
+ private static final String DIGIT = "0123456789" ;
32
+ private static final String ALPHANUM = ALPHA + DIGIT ;
33
+ private static final String UNRESERVED = ALPHANUM + "_-!.~'()*" ;
34
+ private static final String PUNCT = ",;:$&+=" ;
35
+ private static final String RESERVED = PUNCT + "?/[]@" ;
36
+
37
// Characters that parseLeniently() leaves unescaped in the path component.
+ private static final BitSet PATH_ALLOWED = charBitSet ("/@" + UNRESERVED + PUNCT );
38
// Characters that parseLeniently() leaves unescaped in both the query and the fragment components.
+ private static final BitSet QUERY_ALLOWED = charBitSet (UNRESERVED + RESERVED );
39
+
40
+ private static BitSet charBitSet (String chars ) {
41
+ BitSet bitSet = new BitSet (128 );
42
+ for (char c : chars .toCharArray ()) {
43
+ bitSet .set (c );
44
+ }
45
+ return bitSet ;
46
+ }
47
+
28
48
/**
29
49
* Returns true if the given string begins with a http: or https: URI scheme. Does not enforce the string is a
30
50
* valid URI.
@@ -37,27 +57,100 @@ private static boolean startsWithIgnoreCase(String string, String prefix) {
37
57
return string .regionMatches (true , 0 , prefix , 0 , prefix .length ());
38
58
}
39
59
60
+ /**
61
+ * Like URI.create() but attempts to percent encode when possible instead of throwing.
62
+ * Note that parseLeniently(s).toString().equals(s) may be false if percent encoding has occurred.
63
+ * @throws IllegalArgumentException if parsing failed
64
+ */
40
65
public static URI parseLeniently (String uri ) {
41
- Matcher m = URL_REGEX .matcher (uri );
42
- if (!m .matches ()) {
43
- throw new IllegalArgumentException ();
44
- }
45
66
try {
46
- return new URI (m . group ( 1 ), m . group ( 2 ), m . group ( 3 ), m . group ( 4 ), m . group ( 5 ) );
67
+ return new URI (uri );
47
68
} catch (URISyntaxException e ) {
48
- throw new IllegalArgumentException (e );
69
+ Matcher urlMatcher = URL_REGEX .matcher (uri );
70
+ if (!urlMatcher .matches ()) {
71
+ throw new IllegalArgumentException ("invalid URI: " + uri );
72
+ }
73
+
74
+ StringBuilder builder = new StringBuilder ();
75
+
76
+ String scheme = urlMatcher .group (SCHEME );
77
+ if (scheme != null ) {
78
+ builder .append (scheme );
79
+ builder .append (':' );
80
+ }
81
+
82
+ String slashes = urlMatcher .group (SLASHES );
83
+ if (slashes != null ) builder .append (slashes );
84
+
85
+ String authority = urlMatcher .group (AUTHORITY );
86
+ if (authority != null ) {
87
+ builder .append (authority );
88
+ }
89
+
90
+ String path = urlMatcher .group (PATH );
91
+ if (path != null ) {
92
+ builder .append (percentEncodeIfNeeded (path , PATH_ALLOWED ));
93
+ }
94
+
95
+ String query = urlMatcher .group (QUERY );
96
+ if (query != null ) {
97
+ builder .append ('?' );
98
+ builder .append (percentEncodeIfNeeded (query , QUERY_ALLOWED ));
99
+ }
100
+
101
+ String fragment = urlMatcher .group (FRAGMENT );
102
+ if (fragment != null ) {
103
+ builder .append ('#' );
104
+ builder .append (percentEncodeIfNeeded (fragment , QUERY_ALLOWED ));
105
+ }
106
+
107
+ return URI .create (builder .toString ());
108
+ }
109
+ }
110
+
111
+ private static boolean isHexDigit (char c ) {
112
+ return (c >= '0' && c <= '9' )
113
+ || (c >= 'a' && c <= 'f' )
114
+ || (c >= 'A' && c <= 'F' );
115
+ }
116
+
117
+ private static boolean isASCII (char c ) {
118
+ return c <= 127 ;
119
+ }
120
+
121
+ /**
122
+ * Percent encodes a string per the given set of allowed characters. Valid existing percent escapes are
123
+ * preserved instead of double escaped. Unicode characters which are not ASCII, control or space characters
124
+ * are not encoded.
125
+ */
126
+ private static String percentEncodeIfNeeded (String s , BitSet allowed ) {
127
+ StringBuilder out = new StringBuilder ();
128
+ for (int i = 0 ; i < s .length (); i ++) {
129
+ char c = s .charAt (i );
130
+ if (allowed .get (c )) {
131
+ out .append (c );
132
+ } else if (c == '%' && i < s .length () - 2 && isHexDigit (s .charAt (i + 1 )) && isHexDigit (s .charAt (i + 2 ))) {
133
+ out .append (c ); // valid existing escape
134
+ } else if (!isASCII (c ) && !Character .isISOControl (c ) && !Character .isSpaceChar (c )) {
135
+ out .append (c ); // an 'other' unicode character
136
+ } else {
137
+ for (byte b : Character .toString (c ).getBytes (UTF_8 )) {
138
+ out .append ('%' ).append (String .format ("%02x" , (int ) b ));
139
+ }
140
+ }
49
141
}
142
+ return out .toString ();
50
143
}
51
144
52
145
public static String toNormalizedSurt (String uri ) {
53
146
Matcher urlMatcher = URL_REGEX .matcher (uri );
54
147
if (!urlMatcher .matches ()) {
55
148
throw new IllegalArgumentException ("invalid URL: " + uri );
56
149
}
57
- String authority = urlMatcher .group (2 );
58
- String path = urlMatcher .group (3 );
59
- String query = urlMatcher .group (4 );
60
- String fragment = urlMatcher .group (5 );
150
+ String authority = urlMatcher .group (AUTHORITY );
151
+ String path = urlMatcher .group (PATH );
152
+ String query = urlMatcher .group (QUERY );
153
+ String fragment = urlMatcher .group (FRAGMENT );
61
154
62
155
Matcher authorityMatcher = AUTHORITY_REGEX .matcher (authority );
63
156
if (!authorityMatcher .matches ()) throw new IllegalStateException ("authority didn't match" );
@@ -133,7 +226,7 @@ private static String fullyPercentDecode(String s) {
133
226
134
227
public static String percentEncodeIllegals (String s ) {
135
228
StringBuilder out = new StringBuilder ();
136
- byte [] bytes = s .getBytes (StandardCharsets . UTF_8 );
229
+ byte [] bytes = s .getBytes (UTF_8 );
137
230
for (byte rawByte : bytes ) {
138
231
int b = rawByte & 0xff ;
139
232
if (b == '%' || b == '#' || b <= 0x20 || b >= 0x7f ) {
@@ -178,7 +271,7 @@ private static String percentDecode(String s) {
178
271
}
179
272
180
273
private static void tryDecodeUtf8 (ByteBuffer bb , StringBuilder out ) {
181
- CharsetDecoder decoder = StandardCharsets . UTF_8 .newDecoder ();
274
+ CharsetDecoder decoder = UTF_8 .newDecoder ();
182
275
CharBuffer cb = CharBuffer .allocate (bb .remaining ());
183
276
while (bb .hasRemaining ()) {
184
277
CoderResult result = decoder .decode (bb , cb , true );
0 commit comments