| Avi Drissman | e4622aa | 2022-09-08 20:36:06 | [diff] [blame] | 1 | // Copyright 2020 The Chromium Authors |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 5 | #include "base/strings/escape.h" |
| 6 | |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 7 | #include <algorithm> |
| 8 | #include <string> |
| 9 | |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 10 | #include "base/strings/string_util.h" |
| Victor Vasiliev | dc7e817 | 2022-04-20 23:45:20 | [diff] [blame] | 11 | #include "base/strings/stringprintf.h" |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 12 | #include "base/strings/utf_string_conversions.h" |
| 13 | #include "testing/gtest/include/gtest/gtest.h" |
| 14 | |
| 15 | namespace base { |
| Victor Vasiliev | dc7e817 | 2022-04-20 23:45:20 | [diff] [blame] | 16 | namespace { |
| 17 | |
// One escaping test vector: |input| is the raw text passed to the escaping
// function under test and |output| is the escaped text the test expects back.
struct EscapeCase {
  const char* input;
  const char* output;
};
| 22 | |
// One HTML (un)escaping test vector: |input| is fed to the function under
// test and |expected_output| is the text the test expects back.
struct EscapeForHTMLCase {
  const char* input;
  const char* expected_output;
};
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 27 | |
| 28 | struct UnescapeURLCase { |
| 29 | const char* input; |
| 30 | UnescapeRule::Type rules; |
| 31 | const char* output; |
| 32 | }; |
| 33 | |
// One test vector for the UnescapeAndDecodeUTF8URLComponentWithAdjustments
// test, which runs the same |input| through three different unescaping calls.
struct UnescapeAndDecodeCase {
  const char* input;

  // The expected output of UnescapeURLComponent() with UnescapeRule::NORMAL.
  const char* url_unescaped;

  // The expected output of UnescapeURLComponent() with
  // UnescapeRule::REPLACE_PLUS_WITH_SPACE.
  const char* query_unescaped;

  // The expected output of
  // UnescapeAndDecodeUTF8URLComponentWithAdjustments() with
  // UnescapeRule::NORMAL (compared after WideToUTF16()).
  const wchar_t* decoded;
};
| 46 | |
// One offset-adjustment test vector: |input_offset| is an offset into the
// escaped |input|, and |output_offset| is the value the test expects that
// offset to hold after OffsetAdjuster::AdjustOffset() has been applied
// (std::string::npos is used for offsets with no counterpart in the output).
struct AdjustOffsetCase {
  const char* input;
  size_t input_offset;
  size_t output_offset;
};
| 52 | |
| Victor Vasiliev | dc7e817 | 2022-04-20 23:45:20 | [diff] [blame] | 53 | TEST(EscapeTest, EscapeTextForFormSubmission) { |
| 54 | const EscapeCase escape_cases[] = { |
| 55 | {"foo", "foo"}, {"foo bar", "foo+bar"}, {"foo++", "foo%2B%2B"}}; |
| 56 | for (const auto& escape_case : escape_cases) { |
| 57 | EXPECT_EQ(escape_case.output, |
| 58 | EscapeQueryParamValue(escape_case.input, true)); |
| 59 | } |
| 60 | |
| 61 | const EscapeCase escape_cases_no_plus[] = { |
| 62 | {"foo", "foo"}, {"foo bar", "foo%20bar"}, {"foo++", "foo%2B%2B"}}; |
| 63 | for (const auto& escape_case : escape_cases_no_plus) { |
| 64 | EXPECT_EQ(escape_case.output, |
| 65 | EscapeQueryParamValue(escape_case.input, false)); |
| 66 | } |
| 67 | |
| 68 | // Test all the values in we're supposed to be escaping. |
| 69 | const std::string no_escape( |
| 70 | "abcdefghijklmnopqrstuvwxyz" |
| 71 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 72 | "0123456789" |
| 73 | "!'()*-._~"); |
| 74 | for (int i = 0; i < 256; ++i) { |
| 75 | std::string in; |
| 76 | in.push_back(i); |
| 77 | std::string out = EscapeQueryParamValue(in, true); |
| 78 | if (0 == i) { |
| 79 | EXPECT_EQ(out, std::string("%00")); |
| 80 | } else if (32 == i) { |
| 81 | // Spaces are plus escaped like web forms. |
| 82 | EXPECT_EQ(out, std::string("+")); |
| 83 | } else if (no_escape.find(in) == std::string::npos) { |
| 84 | // Check %hex escaping |
| 85 | std::string expected = StringPrintf("%%%02X", i); |
| 86 | EXPECT_EQ(expected, out); |
| 87 | } else { |
| 88 | // No change for things in the no_escape list. |
| 89 | EXPECT_EQ(out, in); |
| 90 | } |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | TEST(EscapeTest, EscapePath) { |
| 95 | ASSERT_EQ( |
| 96 | // Most of the character space we care about, un-escaped |
| 97 | EscapePath("\x02\n\x1d !\"#$%&'()*+,-./0123456789:;" |
| 98 | "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 99 | "[\\]^_`abcdefghijklmnopqrstuvwxyz" |
| 100 | "{|}~\x7f\x80\xff"), |
| 101 | // Escaped |
| 102 | "%02%0A%1D%20!%22%23$%25&'()*+,-./0123456789%3A;" |
| 103 | "%3C=%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 104 | "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz" |
| 105 | "%7B%7C%7D~%7F%80%FF"); |
| 106 | } |
| 107 | |
| 108 | TEST(EscapeTest, EscapeUrlEncodedData) { |
| 109 | ASSERT_EQ( |
| 110 | // Most of the character space we care about, un-escaped |
| 111 | EscapeUrlEncodedData("\x02\n\x1d !\"#$%&'()*+,-./0123456789:;" |
| 112 | "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 113 | "[\\]^_`abcdefghijklmnopqrstuvwxyz" |
| 114 | "{|}~\x7f\x80\xff", |
| 115 | true), |
| 116 | // Escaped |
| 117 | "%02%0A%1D+!%22%23%24%25%26%27()*%2B,-./0123456789:%3B" |
| 118 | "%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 119 | "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz" |
| 120 | "%7B%7C%7D~%7F%80%FF"); |
| 121 | } |
| 122 | |
| 123 | TEST(EscapeTest, EscapeUrlEncodedDataSpace) { |
| 124 | ASSERT_EQ(EscapeUrlEncodedData("a b", true), "a+b"); |
| 125 | ASSERT_EQ(EscapeUrlEncodedData("a b", false), "a%20b"); |
| 126 | } |
| 127 | |
| 128 | TEST(EscapeTest, EscapeForHTML) { |
| 129 | const EscapeForHTMLCase tests[] = { |
| 130 | {"hello", "hello"}, |
| 131 | {"<hello>", "<hello>"}, |
| 132 | {"don\'t mess with me", "don't mess with me"}, |
| 133 | }; |
| 134 | for (const auto& test : tests) { |
| 135 | std::string result = EscapeForHTML(std::string(test.input)); |
| 136 | EXPECT_EQ(std::string(test.expected_output), result); |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | TEST(EscapeTest, UnescapeForHTML) { |
| 141 | const EscapeForHTMLCase tests[] = { |
| 142 | {"", ""}, |
| 143 | {"<hello>", "<hello>"}, |
| 144 | {"don't mess with me", "don\'t mess with me"}, |
| 145 | {"<>&"'", "<>&\"'"}, |
| 146 | {"& lt; & ; &; '", "& lt; & ; &; '"}, |
| 147 | {"&", "&"}, |
| 148 | {""", "\""}, |
| 149 | {"'", "'"}, |
| 150 | {"<", "<"}, |
| 151 | {">", ">"}, |
| 152 | {"& &", "& &"}, |
| 153 | }; |
| 154 | for (const auto& test : tests) { |
| 155 | std::u16string result = UnescapeForHTML(ASCIIToUTF16(test.input)); |
| 156 | EXPECT_EQ(ASCIIToUTF16(test.expected_output), result); |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | TEST(EscapeTest, EscapeExternalHandlerValue) { |
| 161 | ASSERT_EQ( |
| 162 | // Escaped |
| 163 | "%02%0A%1D%20!%22#$%25&'()*+,-./0123456789:;" |
| 164 | "%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 165 | "[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz" |
| 166 | "%7B%7C%7D~%7F%80%FF", |
| 167 | // Most of the character space we care about, un-escaped |
| 168 | EscapeExternalHandlerValue("\x02\n\x1d !\"#$%&'()*+,-./0123456789:;" |
| 169 | "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 170 | "[\\]^_`abcdefghijklmnopqrstuvwxyz" |
| 171 | "{|}~\x7f\x80\xff")); |
| 172 | |
| 173 | ASSERT_EQ( |
| 174 | "!#$&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_" |
| 175 | "abcdefghijklmnopqrstuvwxyz~", |
| 176 | EscapeExternalHandlerValue( |
| 177 | "!#$&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_" |
| 178 | "abcdefghijklmnopqrstuvwxyz~")); |
| 179 | |
| 180 | ASSERT_EQ("%258k", EscapeExternalHandlerValue("%8k")); |
| 181 | ASSERT_EQ("a%25", EscapeExternalHandlerValue("a%")); |
| 182 | ASSERT_EQ("%25a", EscapeExternalHandlerValue("%a")); |
| 183 | ASSERT_EQ("a%258", EscapeExternalHandlerValue("a%8")); |
| 184 | ASSERT_EQ("%ab", EscapeExternalHandlerValue("%ab")); |
| 185 | ASSERT_EQ("%AB", EscapeExternalHandlerValue("%AB")); |
| 186 | |
| 187 | ASSERT_EQ("http://example.com/path/sub?q=a%7Cb%7Cc&q=1%7C2%7C3#ref%7C", |
| 188 | EscapeExternalHandlerValue( |
| 189 | "http://example.com/path/sub?q=a|b|c&q=1|2|3#ref|")); |
| 190 | ASSERT_EQ("http://example.com/path/sub?q=a%7Cb%7Cc&q=1%7C2%7C3#ref%7C", |
| 191 | EscapeExternalHandlerValue( |
| 192 | "http://example.com/path/sub?q=a%7Cb%7Cc&q=1%7C2%7C3#ref%7C")); |
| 193 | ASSERT_EQ("http://[2001:db8:0:1]:80", |
| 194 | EscapeExternalHandlerValue("http://[2001:db8:0:1]:80")); |
| 195 | } |
| 196 | |
| 197 | TEST(EscapeTest, EscapeNonASCII) { |
| 198 | EXPECT_EQ("abc\n%2580%80", EscapeNonASCIIAndPercent("abc\n%80\x80")); |
| 199 | EXPECT_EQ("abc\n%80%80", EscapeNonASCII("abc\n%80\x80")); |
| 200 | } |
| 201 | |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 202 | TEST(EscapeTest, DataURLWithAccentedCharacters) { |
| 203 | const std::string url = |
| 204 | "text/html;charset=utf-8,%3Chtml%3E%3Cbody%3ETonton,%20ton%20th%C3" |
| 205 | "%A9%20t'a-t-il%20%C3%B4t%C3%A9%20ta%20toux%20"; |
| 206 | |
| 207 | OffsetAdjuster::Adjustments adjustments; |
| 208 | UnescapeAndDecodeUTF8URLComponentWithAdjustments(url, UnescapeRule::SPACES, |
| 209 | &adjustments); |
| 210 | } |
| 211 | |
| 212 | TEST(EscapeTest, UnescapeURLComponent) { |
| 213 | const UnescapeURLCase kUnescapeCases[] = { |
| 214 | {"", UnescapeRule::NORMAL, ""}, |
| 215 | {"%2", UnescapeRule::NORMAL, "%2"}, |
| 216 | {"%%%%%%", UnescapeRule::NORMAL, "%%%%%%"}, |
| 217 | {"Don't escape anything", UnescapeRule::NORMAL, "Don't escape anything"}, |
| 218 | {"Invalid %escape %2", UnescapeRule::NORMAL, "Invalid %escape %2"}, |
| 219 | {"Some%20random text %25%2dOK", UnescapeRule::NONE, |
| 220 | "Some%20random text %25%2dOK"}, |
| 221 | {"Some%20random text %25%2dOK", UnescapeRule::NORMAL, |
| 222 | "Some%20random text %25-OK"}, |
| 223 | {"Some%20random text %25%E1%A6", UnescapeRule::NORMAL, |
| 224 | "Some%20random text %25\xE1\xA6"}, |
| 225 | {"Some%20random text %25%E1%A6OK", UnescapeRule::NORMAL, |
| 226 | "Some%20random text %25\xE1\xA6OK"}, |
| 227 | {"Some%20random text %25%E1%A6%99OK", UnescapeRule::NORMAL, |
| 228 | "Some%20random text %25\xE1\xA6\x99OK"}, |
| 229 | |
| 230 | // BiDi Control characters should not be unescaped. |
| 231 | {"Some%20random text %25%D8%9COK", UnescapeRule::NORMAL, |
| 232 | "Some%20random text %25%D8%9COK"}, |
| 233 | {"Some%20random text %25%E2%80%8EOK", UnescapeRule::NORMAL, |
| 234 | "Some%20random text %25%E2%80%8EOK"}, |
| 235 | {"Some%20random text %25%E2%80%8FOK", UnescapeRule::NORMAL, |
| 236 | "Some%20random text %25%E2%80%8FOK"}, |
| 237 | {"Some%20random text %25%E2%80%AAOK", UnescapeRule::NORMAL, |
| 238 | "Some%20random text %25%E2%80%AAOK"}, |
| 239 | {"Some%20random text %25%E2%80%ABOK", UnescapeRule::NORMAL, |
| 240 | "Some%20random text %25%E2%80%ABOK"}, |
| 241 | {"Some%20random text %25%E2%80%AEOK", UnescapeRule::NORMAL, |
| 242 | "Some%20random text %25%E2%80%AEOK"}, |
| 243 | {"Some%20random text %25%E2%81%A6OK", UnescapeRule::NORMAL, |
| 244 | "Some%20random text %25%E2%81%A6OK"}, |
| 245 | {"Some%20random text %25%E2%81%A9OK", UnescapeRule::NORMAL, |
| 246 | "Some%20random text %25%E2%81%A9OK"}, |
| 247 | |
| 248 | // Certain banned characters should not be unescaped. |
| 249 | // U+1F50F LOCK WITH INK PEN |
| 250 | {"Some%20random text %25%F0%9F%94%8FOK", UnescapeRule::NORMAL, |
| 251 | "Some%20random text %25%F0%9F%94%8FOK"}, |
| 252 | // U+1F510 CLOSED LOCK WITH KEY |
| 253 | {"Some%20random text %25%F0%9F%94%90OK", UnescapeRule::NORMAL, |
| 254 | "Some%20random text %25%F0%9F%94%90OK"}, |
| 255 | // U+1F512 LOCK |
| 256 | {"Some%20random text %25%F0%9F%94%92OK", UnescapeRule::NORMAL, |
| 257 | "Some%20random text %25%F0%9F%94%92OK"}, |
| 258 | // U+1F513 OPEN LOCK |
| 259 | {"Some%20random text %25%F0%9F%94%93OK", UnescapeRule::NORMAL, |
| 260 | "Some%20random text %25%F0%9F%94%93OK"}, |
| 261 | |
| 262 | // Spaces |
| 263 | {"(%C2%85)(%C2%A0)(%E1%9A%80)(%E2%80%80)", UnescapeRule::NORMAL, |
| 264 | "(%C2%85)(%C2%A0)(%E1%9A%80)(%E2%80%80)"}, |
| 265 | {"(%E2%80%81)(%E2%80%82)(%E2%80%83)(%E2%80%84)", UnescapeRule::NORMAL, |
| 266 | "(%E2%80%81)(%E2%80%82)(%E2%80%83)(%E2%80%84)"}, |
| 267 | {"(%E2%80%85)(%E2%80%86)(%E2%80%87)(%E2%80%88)", UnescapeRule::NORMAL, |
| 268 | "(%E2%80%85)(%E2%80%86)(%E2%80%87)(%E2%80%88)"}, |
| 269 | {"(%E2%80%89)(%E2%80%8A)(%E2%80%A8)(%E2%80%A9)", UnescapeRule::NORMAL, |
| 270 | "(%E2%80%89)(%E2%80%8A)(%E2%80%A8)(%E2%80%A9)"}, |
| 271 | {"(%E2%80%AF)(%E2%81%9F)(%E3%80%80)", UnescapeRule::NORMAL, |
| 272 | "(%E2%80%AF)(%E2%81%9F)(%E3%80%80)"}, |
| 273 | {"(%E2%A0%80)", UnescapeRule::NORMAL, "(%E2%A0%80)"}, |
| 274 | |
| 275 | // Default Ignorable and Formatting characters should not be unescaped. |
| 276 | {"(%E2%81%A5)(%EF%BF%B0)(%EF%BF%B8)", UnescapeRule::NORMAL, |
| 277 | "(%E2%81%A5)(%EF%BF%B0)(%EF%BF%B8)"}, |
| 278 | {"(%F3%A0%82%80)(%F3%A0%83%BF)(%F3%A0%87%B0)", UnescapeRule::NORMAL, |
| 279 | "(%F3%A0%82%80)(%F3%A0%83%BF)(%F3%A0%87%B0)"}, |
| 280 | {"(%F3%A0%BF%BF)(%C2%AD)(%CD%8F)", UnescapeRule::NORMAL, |
| 281 | "(%F3%A0%BF%BF)(%C2%AD)(%CD%8F)"}, |
| 282 | {"(%D8%80%20)(%D8%85)(%DB%9D)(%DC%8F)(%E0%A3%A2)", UnescapeRule::NORMAL, |
| 283 | "(%D8%80%20)(%D8%85)(%DB%9D)(%DC%8F)(%E0%A3%A2)"}, |
| 284 | {"(%E1%85%9F)(%E1%85%A0)(%E1%9E%B4)(%E1%9E%B5)", UnescapeRule::NORMAL, |
| 285 | "(%E1%85%9F)(%E1%85%A0)(%E1%9E%B4)(%E1%9E%B5)"}, |
| 286 | {"(%E1%A0%8B)(%E1%A0%8C)(%E1%A0%8D)(%E1%A0%8E)", UnescapeRule::NORMAL, |
| 287 | "(%E1%A0%8B)(%E1%A0%8C)(%E1%A0%8D)(%E1%A0%8E)"}, |
| 288 | {"(%E2%80%8B)(%E2%80%8C)(%E2%80%8D)(%E2%81%A0)", UnescapeRule::NORMAL, |
| 289 | "(%E2%80%8B)(%E2%80%8C)(%E2%80%8D)(%E2%81%A0)"}, |
| 290 | {"(%E2%81%A1)(%E2%81%A2)(%E2%81%A3)(%E2%81%A4)", UnescapeRule::NORMAL, |
| 291 | "(%E2%81%A1)(%E2%81%A2)(%E2%81%A3)(%E2%81%A4)"}, |
| 292 | {"(%E3%85%A4)(%EF%BB%BF)(%EF%BE%A0)(%EF%BF%B9)", UnescapeRule::NORMAL, |
| 293 | "(%E3%85%A4)(%EF%BB%BF)(%EF%BE%A0)(%EF%BF%B9)"}, |
| 294 | {"(%EF%BF%BB)(%F0%91%82%BD)(%F0%91%83%8D)", UnescapeRule::NORMAL, |
| 295 | "(%EF%BF%BB)(%F0%91%82%BD)(%F0%91%83%8D)"}, |
| 296 | {"(%F0%93%90%B0)(%F0%93%90%B8)", UnescapeRule::NORMAL, |
| 297 | "(%F0%93%90%B0)(%F0%93%90%B8)"}, |
| 298 | // General Punctuation - Deprecated (U+206A--206F) |
| 299 | {"(%E2%81%AA)(%E2%81%AD)(%E2%81%AF)", UnescapeRule::NORMAL, |
| 300 | "(%E2%81%AA)(%E2%81%AD)(%E2%81%AF)"}, |
| 301 | // Variation selectors (U+FE00--FE0F) |
| 302 | {"(%EF%B8%80)(%EF%B8%8C)(%EF%B8%8D)", UnescapeRule::NORMAL, |
| 303 | "(%EF%B8%80)(%EF%B8%8C)(%EF%B8%8D)"}, |
| 304 | // Shorthand format controls (U+1BCA0--1BCA3) |
| 305 | {"(%F0%9B%B2%A0)(%F0%9B%B2%A1)(%F0%9B%B2%A3)", UnescapeRule::NORMAL, |
| 306 | "(%F0%9B%B2%A0)(%F0%9B%B2%A1)(%F0%9B%B2%A3)"}, |
| 307 | // Musical symbols beams and slurs (U+1D173--1D17A) |
| 308 | {"(%F0%9D%85%B3)(%F0%9D%85%B9)(%F0%9D%85%BA)", UnescapeRule::NORMAL, |
| 309 | "(%F0%9D%85%B3)(%F0%9D%85%B9)(%F0%9D%85%BA)"}, |
| 310 | // Tags block (U+E0000--E007F), includes unassigned points |
| 311 | {"(%F3%A0%80%80)(%F3%A0%80%81)(%F3%A0%81%8F)", UnescapeRule::NORMAL, |
| 312 | "(%F3%A0%80%80)(%F3%A0%80%81)(%F3%A0%81%8F)"}, |
| 313 | // Ideographic-specific variation selectors (U+E0100--E01EF) |
| 314 | {"(%F3%A0%84%80)(%F3%A0%84%90)(%F3%A0%87%AF)", UnescapeRule::NORMAL, |
| 315 | "(%F3%A0%84%80)(%F3%A0%84%90)(%F3%A0%87%AF)"}, |
| 316 | |
| 317 | // Two spoofing characters in a row should not be unescaped. |
| 318 | {"%D8%9C%D8%9C", UnescapeRule::NORMAL, "%D8%9C%D8%9C"}, |
| 319 | // Non-spoofing characters surrounded by spoofing characters should be |
| 320 | // unescaped. |
| 321 | {"%D8%9C%C2%A1%D8%9C%C2%A1", UnescapeRule::NORMAL, |
| 322 | "%D8%9C\xC2\xA1%D8%9C\xC2\xA1"}, |
| 323 | // Invalid UTF-8 characters surrounded by spoofing characters should be |
| 324 | // unescaped. |
| 325 | {"%D8%9C%85%D8%9C%85", UnescapeRule::NORMAL, "%D8%9C\x85%D8%9C\x85"}, |
| 326 | // Test with enough trail bytes to overflow the CBU8_MAX_LENGTH-byte |
| 327 | // buffer. The first two bytes are a spoofing character as well. |
| 328 | {"%D8%9C%9C%9C%9C%9C%9C%9C%9C%9C%9C", UnescapeRule::NORMAL, |
| 329 | "%D8%9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C"}, |
| 330 | |
| 331 | {"Some%20random text %25%2dOK", UnescapeRule::SPACES, |
| 332 | "Some random text %25-OK"}, |
| 333 | {"Some%20random text %25%2dOK", UnescapeRule::PATH_SEPARATORS, |
| 334 | "Some%20random text %25-OK"}, |
| 335 | {"Some%20random text %25%2dOK", |
| 336 | UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS, |
| 337 | "Some%20random text %-OK"}, |
| 338 | {"Some%20random text %25%2dOK", |
| 339 | UnescapeRule::SPACES | |
| 340 | UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS, |
| 341 | "Some random text %-OK"}, |
| 342 | {"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, "\xA0\xB1\xC2\xD3\xE4\xF5"}, |
| 343 | {"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, "\xAa\xBb\xCc\xDd\xEe\xFf"}, |
| 344 | // Certain URL-sensitive characters should not be unescaped unless asked. |
| 345 | {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", |
| 346 | UnescapeRule::SPACES, "Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"}, |
| 347 | {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", |
| 348 | UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS, |
| 349 | "Hello%20%13%10world ## ?? == && %% ++"}, |
| 350 | // We can neither escape nor unescape '@' since some websites expect it to |
| 351 | // be preserved as either '@' or "%40". |
| 352 | // See http://b/996720 and http://crbug.com/23933 . |
| 353 | {"me@my%40example", UnescapeRule::NORMAL, "me@my%40example"}, |
| 354 | // Control characters. |
| 355 | {"%01%02%03%04%05%06%07%08%09 %25", |
| 356 | UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS, |
| 357 | "%01%02%03%04%05%06%07%08%09 %"}, |
| 358 | {"Hello%20%13%10%02", UnescapeRule::SPACES, "Hello %13%10%02"}, |
| 359 | |
| 360 | // '/' and '\\' should only be unescaped by PATH_SEPARATORS. |
| 361 | {"%2F%5C", UnescapeRule::PATH_SEPARATORS, "/\\"}, |
| 362 | }; |
| 363 | |
| 364 | for (const auto unescape_case : kUnescapeCases) { |
| 365 | EXPECT_EQ(unescape_case.output, |
| 366 | UnescapeURLComponent(unescape_case.input, unescape_case.rules)); |
| 367 | } |
| 368 | |
| 369 | // Test NULL character unescaping, which can't be tested above since those are |
| 370 | // just char pointers. |
| 371 | std::string input("Null"); |
| 372 | input.push_back(0); // Also have a NULL in the input. |
| 373 | input.append("%00%39Test"); |
| 374 | |
| 375 | std::string expected = "Null"; |
| 376 | expected.push_back(0); |
| 377 | expected.append("%009Test"); |
| 378 | EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); |
| 379 | } |
| 380 | |
| 381 | TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponentWithAdjustments) { |
| 382 | const UnescapeAndDecodeCase unescape_cases[] = { |
| 383 | {"%", "%", "%", L"%"}, |
| 384 | {"+", "+", " ", L"+"}, |
| 385 | {"%2+", "%2+", "%2 ", L"%2+"}, |
| 386 | {"+%%%+%%%", "+%%%+%%%", " %%% %%%", L"+%%%+%%%"}, |
| 387 | {"Don't escape anything", "Don't escape anything", |
| 388 | "Don't escape anything", L"Don't escape anything"}, |
| 389 | {"+Invalid %escape %2+", "+Invalid %escape %2+", " Invalid %escape %2 ", |
| 390 | L"+Invalid %escape %2+"}, |
| 391 | {"Some random text %25%2dOK", "Some random text %25-OK", |
| 392 | "Some random text %25-OK", L"Some random text %25-OK"}, |
| 393 | {"%01%02%03%04%05%06%07%08%09", "%01%02%03%04%05%06%07%08%09", |
| 394 | "%01%02%03%04%05%06%07%08%09", L"%01%02%03%04%05%06%07%08%09"}, |
| 395 | {"%E4%BD%A0+%E5%A5%BD", "\xE4\xBD\xA0+\xE5\xA5\xBD", |
| 396 | "\xE4\xBD\xA0 \xE5\xA5\xBD", L"\x4f60+\x597d"}, |
| 397 | {"%ED%ED", // Invalid UTF-8. |
| 398 | "\xED\xED", "\xED\xED", L"%ED%ED"}, // Invalid UTF-8 -> kept unescaped. |
| 399 | }; |
| 400 | |
| 401 | for (const auto& unescape_case : unescape_cases) { |
| 402 | std::string unescaped = |
| 403 | UnescapeURLComponent(unescape_case.input, UnescapeRule::NORMAL); |
| 404 | EXPECT_EQ(std::string(unescape_case.url_unescaped), unescaped); |
| 405 | |
| 406 | unescaped = UnescapeURLComponent(unescape_case.input, |
| 407 | UnescapeRule::REPLACE_PLUS_WITH_SPACE); |
| 408 | EXPECT_EQ(std::string(unescape_case.query_unescaped), unescaped); |
| 409 | |
| 410 | // The adjustments argument is covered by the next test. |
| 411 | // |
| 412 | // TODO: Need to test unescape_spaces and unescape_percent. |
| Jan Wilken Dörrie | 085b2aa | 2021-03-12 16:26:57 | [diff] [blame] | 413 | std::u16string decoded = UnescapeAndDecodeUTF8URLComponentWithAdjustments( |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 414 | unescape_case.input, UnescapeRule::NORMAL, nullptr); |
| 415 | EXPECT_EQ(WideToUTF16(unescape_case.decoded), decoded); |
| 416 | } |
| 417 | } |
| 418 | |
| 419 | TEST(EscapeTest, AdjustOffset) { |
| 420 | const AdjustOffsetCase adjust_cases[] = { |
| 421 | {"", 0, 0}, |
| 422 | {"test", 0, 0}, |
| 423 | {"test", 2, 2}, |
| 424 | {"test", 4, 4}, |
| 425 | {"test", std::string::npos, std::string::npos}, |
| 426 | {"%2dtest", 6, 4}, |
| 427 | {"%2dtest", 3, 1}, |
| 428 | {"%2dtest", 2, std::string::npos}, |
| 429 | {"%2dtest", 1, std::string::npos}, |
| 430 | {"%2dtest", 0, 0}, |
| 431 | {"test%2d", 2, 2}, |
| 432 | {"test%2e", 2, 2}, |
| 433 | {"%E4%BD%A0+%E5%A5%BD", 9, 1}, |
| 434 | {"%E4%BD%A0+%E5%A5%BD", 6, std::string::npos}, |
| 435 | {"%E4%BD%A0+%E5%A5%BD", 0, 0}, |
| 436 | {"%E4%BD%A0+%E5%A5%BD", 10, 2}, |
| 437 | {"%E4%BD%A0+%E5%A5%BD", 19, 3}, |
| 438 | |
| 439 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 18, 8}, |
| 440 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 15, std::string::npos}, |
| 441 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 9, 7}, |
| 442 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 19, 9}, |
| 443 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 28, 10}, |
| 444 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 0, 0}, |
| 445 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 2, 2}, |
| 446 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 3, std::string::npos}, |
| 447 | {"hi%41test%E4%BD%A0+%E5%A5%BD", 5, 3}, |
| 448 | |
| 449 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 9, 1}, |
| 450 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 6, std::string::npos}, |
| 451 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 0, 0}, |
| 452 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 10, 2}, |
| 453 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 19, 3}, |
| 454 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 21, 5}, |
| 455 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 22, std::string::npos}, |
| 456 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 24, 6}, |
| 457 | {"%E4%BD%A0+%E5%A5%BDhi%41test", 28, 10}, |
| 458 | |
| 459 | {"%ED%B0%80+%E5%A5%BD", 6, 6}, // not convertible to UTF-8 |
| 460 | }; |
| 461 | |
| 462 | for (const auto& adjust_case : adjust_cases) { |
| 463 | size_t offset = adjust_case.input_offset; |
| 464 | OffsetAdjuster::Adjustments adjustments; |
| 465 | UnescapeAndDecodeUTF8URLComponentWithAdjustments( |
| 466 | adjust_case.input, UnescapeRule::NORMAL, &adjustments); |
| 467 | OffsetAdjuster::AdjustOffset(adjustments, &offset); |
| 468 | EXPECT_EQ(adjust_case.output_offset, offset) |
| 469 | << "input=" << adjust_case.input |
| 470 | << " offset=" << adjust_case.input_offset; |
| 471 | } |
| 472 | } |
| 473 | |
| 474 | TEST(EscapeTest, UnescapeBinaryURLComponent) { |
| 475 | const UnescapeURLCase kTestCases[] = { |
| 476 | // Check that ASCII characters with special handling in |
| 477 | // UnescapeURLComponent() are still unescaped. |
| 478 | {"%09%20%25foo%2F", UnescapeRule::NORMAL, "\x09 %foo/"}, |
| 479 | |
| 480 | // UTF-8 Characters banned by UnescapeURLComponent() should also be |
| 481 | // unescaped. |
| 482 | {"Some random text %D8%9COK", UnescapeRule::NORMAL, |
| 483 | "Some random text \xD8\x9COK"}, |
| 484 | {"Some random text %F0%9F%94%8FOK", UnescapeRule::NORMAL, |
| 485 | "Some random text \xF0\x9F\x94\x8FOK"}, |
| 486 | |
| 487 | // As should invalid UTF-8 characters. |
| 488 | {"%A0%A0%E9%E9%A0%A0%A0%A0", UnescapeRule::NORMAL, |
| 489 | "\xA0\xA0\xE9\xE9\xA0\xA0\xA0\xA0"}, |
| 490 | |
| 491 | // And valid UTF-8 characters that are not banned by |
| 492 | // UnescapeURLComponent() should be unescaped, too! |
| 493 | {"%C2%A1%C2%A1", UnescapeRule::NORMAL, "\xC2\xA1\xC2\xA1"}, |
| 494 | |
| 495 | // '+' should be left alone by default |
| 496 | {"++%2B++", UnescapeRule::NORMAL, "+++++"}, |
| 497 | // But should magically be turned into a space if requested. |
| 498 | {"++%2B++", UnescapeRule::REPLACE_PLUS_WITH_SPACE, " + "}, |
| 499 | }; |
| 500 | |
| 501 | for (const auto& test_case : kTestCases) { |
| 502 | EXPECT_EQ(test_case.output, |
| 503 | UnescapeBinaryURLComponent(test_case.input, test_case.rules)); |
| 504 | } |
| 505 | |
| 506 | // Test NULL character unescaping, which can't be tested above since those are |
| 507 | // just char pointers. |
| 508 | std::string input("Null"); |
| 509 | input.push_back(0); // Also have a NULL in the input. |
| 510 | input.append("%00%39Test"); |
| 511 | |
| 512 | std::string expected("Null"); |
| 513 | expected.push_back(0); |
| 514 | expected.push_back(0); |
| 515 | expected.append("9Test"); |
| 516 | EXPECT_EQ(expected, UnescapeBinaryURLComponent(input)); |
| 517 | } |
| 518 | |
| 519 | TEST(EscapeTest, UnescapeBinaryURLComponentSafe) { |
| 520 | const struct TestCase { |
| 521 | const char* input; |
| 522 | // Expected output. Null if call is expected to fail when |
| 523 | // |fail_on_path_separators| is false. |
| 524 | const char* expected_output; |
| 525 | // Whether |input| has any escaped path separators. |
| 526 | bool has_path_separators; |
| 527 | } kTestCases[] = { |
| 528 | // Spaces, percents, and invalid UTF-8 characters are all successfully |
| 529 | // unescaped. |
| 530 | {"%20%25foo%81", " %foo\x81", false}, |
| 531 | |
| 532 | // Characters disallowed unconditionally. |
| 533 | {"foo%00", nullptr, false}, |
| 534 | {"foo%01", nullptr, false}, |
| 535 | {"foo%0A", nullptr, false}, |
| 536 | {"foo%0D", nullptr, false}, |
| 537 | |
| 538 | // Path separators. |
| 539 | {"foo%2F", "foo/", true}, |
| 540 | {"foo%5C", "foo\\", true}, |
| 541 | |
| 542 | // Characters that are considered invalid to escape are ignored if passed |
| 543 | // in unescaped. |
| 544 | {"foo\x01\r/\\", "foo\x01\r/\\", false}, |
| 545 | }; |
| 546 | |
| 547 | for (const auto& test_case : kTestCases) { |
| 548 | SCOPED_TRACE(test_case.input); |
| 549 | |
| 550 | std::string output = "foo"; |
| 551 | if (!test_case.expected_output) { |
| 552 | EXPECT_FALSE(UnescapeBinaryURLComponentSafe( |
| 553 | test_case.input, false /* fail_on_path_separators */, &output)); |
| 554 | EXPECT_TRUE(output.empty()); |
| 555 | EXPECT_FALSE(UnescapeBinaryURLComponentSafe( |
| 556 | test_case.input, true /* fail_on_path_separators */, &output)); |
| 557 | EXPECT_TRUE(output.empty()); |
| 558 | continue; |
| 559 | } |
| 560 | EXPECT_TRUE(UnescapeBinaryURLComponentSafe( |
| 561 | test_case.input, false /* fail_on_path_separators */, &output)); |
| 562 | EXPECT_EQ(test_case.expected_output, output); |
| 563 | if (test_case.has_path_separators) { |
| 564 | EXPECT_FALSE(UnescapeBinaryURLComponentSafe( |
| 565 | test_case.input, true /* fail_on_path_separators */, &output)); |
| 566 | EXPECT_TRUE(output.empty()); |
| 567 | } else { |
| 568 | output = "foo"; |
| 569 | EXPECT_TRUE(UnescapeBinaryURLComponentSafe( |
| 570 | test_case.input, true /* fail_on_path_separators */, &output)); |
| 571 | EXPECT_EQ(test_case.expected_output, output); |
| 572 | } |
| 573 | } |
| 574 | } |
| 575 | |
| 576 | TEST(EscapeTest, ContainsEncodedBytes) { |
| 577 | EXPECT_FALSE(ContainsEncodedBytes("abc/def", {'/', '\\'})); |
| 578 | EXPECT_FALSE(ContainsEncodedBytes("abc%2Fdef", {'%'})); |
| 579 | EXPECT_TRUE(ContainsEncodedBytes("abc%252Fdef", {'%'})); |
| 580 | EXPECT_TRUE(ContainsEncodedBytes("abc%2Fdef", {'/', '\\'})); |
| 581 | EXPECT_TRUE(ContainsEncodedBytes("abc%5Cdef", {'/', '\\'})); |
| 582 | EXPECT_TRUE(ContainsEncodedBytes("abc%2fdef", {'/', '\\'})); |
| 583 | |
| 584 | // Should be looking for byte values, not UTF-8 character values. |
| Peter Kasting | c1eb091 | 2021-06-09 17:22:05 | [diff] [blame] | 585 | EXPECT_TRUE( |
| 586 | ContainsEncodedBytes("caf%C3%A9", {static_cast<uint8_t>('\xc3')})); |
| 587 | EXPECT_FALSE( |
| 588 | ContainsEncodedBytes("caf%C3%A9", {static_cast<uint8_t>('\xe9')})); |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 589 | } |
| 590 | |
| Victor Vasiliev | dc7e817 | 2022-04-20 23:45:20 | [diff] [blame] | 591 | } // namespace |
| Weilun Shi | 40194033 | 2020-07-14 22:22:33 | [diff] [blame] | 592 | } // namespace base |