diff --git a/llvm/include/llvm/Support/YAMLParser.h b/llvm/include/llvm/Support/YAMLParser.h index f4767641647c..9d95a1e13a0d 100644 --- a/llvm/include/llvm/Support/YAMLParser.h +++ b/llvm/include/llvm/Support/YAMLParser.h @@ -240,9 +240,14 @@ public: private: StringRef Value; - StringRef unescapeDoubleQuoted(StringRef UnquotedValue, - StringRef::size_type Start, + StringRef getDoubleQuotedValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage) const; + + static StringRef getSingleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage); + + static StringRef getPlainValue(StringRef RawValue, + SmallVectorImpl<char> &Storage); }; /// A block scalar node is an opaque datum that can be presented as a diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index b47cb3ae3b44..fdd0ed6e682e 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -2030,186 +2030,231 @@ bool Node::failed() const { } StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { - // TODO: Handle newlines properly. We need to remove leading whitespace. - if (Value[0] == '"') { // Double quoted. - // Pull off the leading and trailing "s. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - // Search for characters that would require unescaping the value. - StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); - if (i != StringRef::npos) - return unescapeDoubleQuoted(UnquotedValue, i, Storage); - return UnquotedValue; - } else if (Value[0] == '\'') { // Single quoted. - // Pull off the leading and trailing 's. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - StringRef::size_type i = UnquotedValue.find('\''); - if (i != StringRef::npos) { - // We're going to need Storage.
- Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { - StringRef Valid(UnquotedValue.begin(), i); - llvm::append_range(Storage, Valid); - Storage.push_back('\''); - UnquotedValue = UnquotedValue.substr(i + 2); - } - llvm::append_range(Storage, UnquotedValue); - return StringRef(Storage.begin(), Storage.size()); - } - return UnquotedValue; - } - // Plain. - // Trim whitespace ('b-char' and 's-white'). - // NOTE: Alternatively we could change the scanner to not include whitespace - // here in the first place. - return Value.rtrim("\x0A\x0D\x20\x09"); + if (Value[0] == '"') + return getDoubleQuotedValue(Value, Storage); + if (Value[0] == '\'') + return getSingleQuotedValue(Value, Storage); + return getPlainValue(Value, Storage); } -StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue - , StringRef::size_type i - , SmallVectorImpl<char> &Storage) - const { - // Use Storage to build proper value. +/// parseScalarValue - A common parsing routine for all flow scalar styles. +/// It handles line break characters by itself, adds regular content characters +/// to the result, and forwards escaped sequences to the provided routine for +/// the style-specific processing. +/// +/// \param UnquotedValue - An input value without quotation marks. +/// \param Storage - A storage for the result if the input value is multiline or +/// contains escaped characters. +/// \param LookupChars - A set of special characters to search in the input +/// string. Should include line break characters and the escape character +/// specific to the scalar style being processed, if any. +/// \param UnescapeCallback - This is called when the escape character is found +/// in the input. +/// \returns - The unfolded and unescaped value.
+static StringRef +parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage, + StringRef LookupChars, + std::function<StringRef(StringRef, SmallVectorImpl<char> &)> + UnescapeCallback) { + size_t I = UnquotedValue.find_first_of(LookupChars); + if (I == StringRef::npos) + return UnquotedValue; + Storage.clear(); Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { - // Insert all previous chars into Storage. - StringRef Valid(UnquotedValue.begin(), i); - llvm::append_range(Storage, Valid); - // Chop off inserted chars. - UnquotedValue = UnquotedValue.substr(i); - - assert(!UnquotedValue.empty() && "Can't be empty!"); - - // Parse escape or line break. - switch (UnquotedValue[0]) { - case '\r': - case '\n': - Storage.push_back('\n'); - if ( UnquotedValue.size() > 1 - && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) - UnquotedValue = UnquotedValue.substr(1); - UnquotedValue = UnquotedValue.substr(1); - break; - default: - if (UnquotedValue.size() == 1) { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code", T); - return ""; - } - UnquotedValue = UnquotedValue.substr(1); - switch (UnquotedValue[0]) { - default: { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code", T); - return ""; - } - case '\r': - // Shrink the Windows-style EOL.
- if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') - UnquotedValue = UnquotedValue.drop_front(1); - [[fallthrough]]; - case '\n': - UnquotedValue = UnquotedValue.drop_front(1).ltrim(" \t"); - continue; - case '0': - Storage.push_back(0x00); - break; - case 'a': - Storage.push_back(0x07); - break; - case 'b': - Storage.push_back(0x08); - break; - case 't': - case 0x09: - Storage.push_back(0x09); - break; - case 'n': - Storage.push_back(0x0A); - break; - case 'v': - Storage.push_back(0x0B); - break; - case 'f': - Storage.push_back(0x0C); - break; - case 'r': - Storage.push_back(0x0D); - break; - case 'e': - Storage.push_back(0x1B); - break; - case ' ': - Storage.push_back(0x20); - break; - case '"': - Storage.push_back(0x22); - break; - case '/': - Storage.push_back(0x2F); - break; - case '\\': - Storage.push_back(0x5C); - break; - case 'N': - encodeUTF8(0x85, Storage); - break; - case '_': - encodeUTF8(0xA0, Storage); - break; - case 'L': - encodeUTF8(0x2028, Storage); - break; - case 'P': - encodeUTF8(0x2029, Storage); - break; - case 'x': { - if (UnquotedValue.size() < 3) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(2); - break; - } - case 'u': { - if (UnquotedValue.size() < 5) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(4); - break; - } - case 'U': { - if (UnquotedValue.size() < 9) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. 
- UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(8); - break; - } - } - UnquotedValue = UnquotedValue.substr(1); + char LastNewLineAddedAs = '\0'; + for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) { + if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') { + llvm::append_range(Storage, UnquotedValue.take_front(I)); + UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage); + LastNewLineAddedAs = '\0'; + continue; } + if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I); + LastNonSWhite != StringRef::npos) { + llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1)); + Storage.push_back(' '); + LastNewLineAddedAs = ' '; + } else { + // Note: we can't just check if the last character in Storage is ' ', + // '\n', or something else; that would give a wrong result for double + // quoted values containing an escaped space character before a new-line + // character. 
+ switch (LastNewLineAddedAs) { + case ' ': + assert(!Storage.empty() && Storage.back() == ' '); + Storage.back() = '\n'; + LastNewLineAddedAs = '\n'; + break; + case '\n': + assert(!Storage.empty() && Storage.back() == '\n'); + Storage.push_back('\n'); + break; + default: + Storage.push_back(' '); + LastNewLineAddedAs = ' '; + break; + } + } + // Handle Windows-style EOL + if (UnquotedValue.substr(I, 2) == "\r\n") + I++; + UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t"); } llvm::append_range(Storage, UnquotedValue); return StringRef(Storage.begin(), Storage.size()); } +StringRef +ScalarNode::getDoubleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) const { + assert(RawValue.size() >= 2 && RawValue.front() == '"' && + RawValue.back() == '"'); + StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); + + auto UnescapeFunc = [this](StringRef UnquotedValue, + SmallVectorImpl<char> &Storage) { + assert(UnquotedValue.take_front(1) == "\\"); + if (UnquotedValue.size() == 1) { + Token T; + T.Range = UnquotedValue; + setError("Unrecognized escape code", T); + Storage.clear(); + return StringRef(); + } + UnquotedValue = UnquotedValue.drop_front(1); + switch (UnquotedValue[0]) { + default: { + Token T; + T.Range = UnquotedValue.take_front(1); + setError("Unrecognized escape code", T); + Storage.clear(); + return StringRef(); + } + case '\r': + // Shrink the Windows-style EOL.
+ if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') + UnquotedValue = UnquotedValue.drop_front(1); + [[fallthrough]]; + case '\n': + return UnquotedValue.drop_front(1).ltrim(" \t"); + case '0': + Storage.push_back(0x00); + break; + case 'a': + Storage.push_back(0x07); + break; + case 'b': + Storage.push_back(0x08); + break; + case 't': + case 0x09: + Storage.push_back(0x09); + break; + case 'n': + Storage.push_back(0x0A); + break; + case 'v': + Storage.push_back(0x0B); + break; + case 'f': + Storage.push_back(0x0C); + break; + case 'r': + Storage.push_back(0x0D); + break; + case 'e': + Storage.push_back(0x1B); + break; + case ' ': + Storage.push_back(0x20); + break; + case '"': + Storage.push_back(0x22); + break; + case '/': + Storage.push_back(0x2F); + break; + case '\\': + Storage.push_back(0x5C); + break; + case 'N': + encodeUTF8(0x85, Storage); + break; + case '_': + encodeUTF8(0xA0, Storage); + break; + case 'L': + encodeUTF8(0x2028, Storage); + break; + case 'P': + encodeUTF8(0x2029, Storage); + break; + case 'x': { + if (UnquotedValue.size() < 3) + // TODO: Report error. + break; + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(3); + } + case 'u': { + if (UnquotedValue.size() < 5) + // TODO: Report error. + break; + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(5); + } + case 'U': { + if (UnquotedValue.size() < 9) + // TODO: Report error. + break; + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. 
+ UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(9); + } + } + return UnquotedValue.drop_front(1); + }; + + return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc); +} + +StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) { + assert(RawValue.size() >= 2 && RawValue.front() == '\'' && + RawValue.back() == '\''); + StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); + + auto UnescapeFunc = [](StringRef UnquotedValue, + SmallVectorImpl<char> &Storage) { + assert(UnquotedValue.take_front(2) == "''"); + Storage.push_back('\''); + return UnquotedValue.drop_front(2); + }; + + return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc); +} + +StringRef ScalarNode::getPlainValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) { + // Trim trailing whitespace ('b-char' and 's-white'). + // NOTE: Alternatively we could change the scanner to not include whitespace + // here in the first place.
+ RawValue = RawValue.rtrim("\r\n \t"); + return parseScalarValue(RawValue, Storage, "\r\n", nullptr); +} + Node *KeyValueNode::getKey() { if (Key) return Key; diff --git a/llvm/test/YAMLParser/spec-05-13.test b/llvm/test/YAMLParser/spec-05-13.test index e7ec42a4aaa8..b2367a373ee4 100644 --- a/llvm/test/YAMLParser/spec-05-13.test +++ b/llvm/test/YAMLParser/spec-05-13.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "Text containing \n both space and\t\n \ttab\tcharacters" +# CHECK: "Text containing both space and tab\tcharacters" "Text containing both space and diff --git a/llvm/test/YAMLParser/spec-05-14.test b/llvm/test/YAMLParser/spec-05-14.test index 984f3721312a..87d699dbc027 100644 --- a/llvm/test/YAMLParser/spec-05-14.test +++ b/llvm/test/YAMLParser/spec-05-14.test @@ -6,4 +6,4 @@ \ \_ \N \L \P \ \x41 \u0041 \U00000041" -# CHECK: !!str "Fun with \\\n\" \a \b \e \f \n \r \t \v \0 \_ \N \L \P A A A" +# CHECK: !!str "Fun with \\ \" \a \b \e \f \n \r \t \v \0 \_ \N \L \P A A A" diff --git a/llvm/test/YAMLParser/spec-09-01.test b/llvm/test/YAMLParser/spec-09-01.test index 2b5a6f31166d..e552e7ca2644 100644 --- a/llvm/test/YAMLParser/spec-09-01.test +++ b/llvm/test/YAMLParser/spec-09-01.test @@ -4,8 +4,8 @@ # CHECK-NEXT: : !!map { # CHECK-NEXT: ? !!str "also simple" # CHECK-NEXT: : !!str "value", -# CHECK-NEXT: ? !!str "not a\n simple key" -# CHECK-NEXT: : !!str "any\n value", +# CHECK-NEXT: ? 
!!str "not a simple key" +# CHECK-NEXT: : !!str "any value", # CHECK-NEXT: }, # CHECK-NEXT: } diff --git a/llvm/test/YAMLParser/spec-09-02.test b/llvm/test/YAMLParser/spec-09-02.test index 51ea61dd2327..99c836bf0047 100644 --- a/llvm/test/YAMLParser/spec-09-02.test +++ b/llvm/test/YAMLParser/spec-09-02.test @@ -1,12 +1,24 @@ # RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s --strict-whitespace -# CHECK: "as space\n trimmed \n specific\L\n escaped\t\n none" +# CHECK: "as space trimmed\nspecific\L escaped\t none" ## Note: The example was originally taken from Spec 1.1, but the parsing rules ## have been changed since then. -## * The paragraph-separator character '\u2029' is excluded from line-break +## * The line-separator character '\u2028' is no longer considered a line-break +## character, so the line "...specific\u2028\nescaped..." is now parsed as +## "...specific\L escaped...". +## * The paragraph-separator character '\u2029' is also excluded from line-break ## characters, so the original sequence "escaped\t\\\u2029" is no longer -## considered valid. This is replaced by "escaped\t\\\n" in the test source. +## considered valid. This is replaced by "escaped\t\\\n" in the test source, +## so the output has changed as well. ## See https://yaml.org/spec/1.2.2/ext/changes/ for details. +## +## Note 2: Different parsers handle this corner case example differently. +## * https://github.com/yaml/libyaml: +## "as space trimmed\nspecific\L\nescaped\t\nnone" +## * https://github.com/yaml/yaml-reference-parser (parser-1.2): +## "as space trimmed\nspecific\L escaped\t none" +## * https://github.com/yaml/yaml-reference-parser (parser-1.3): +## "as space trimmed\nspecific
 escaped\t none" "as space trimmed diff --git a/llvm/test/YAMLParser/spec-09-03.test b/llvm/test/YAMLParser/spec-09-03.test index c656058b7ff8..f067d1366f06 100644 --- a/llvm/test/YAMLParser/spec-09-03.test +++ b/llvm/test/YAMLParser/spec-09-03.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "\n last", -# CHECK-NEXT: !!str " \t\n last", -# CHECK-NEXT: !!str " \tfirst\n last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " \tfirst last", # CHECK-NEXT: ] - " diff --git a/llvm/test/YAMLParser/spec-09-04.test b/llvm/test/YAMLParser/spec-09-04.test index e4f77ea83c7a..79af877b38c8 100644 --- a/llvm/test/YAMLParser/spec-09-04.test +++ b/llvm/test/YAMLParser/spec-09-04.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "first\n \tinner 1\t\n inner 2 last" +# CHECK: "first inner 1 inner 2 last" "first inner 1 diff --git a/llvm/test/YAMLParser/spec-09-05.test b/llvm/test/YAMLParser/spec-09-05.test index 5eb5b22f421d..4a748e609f1d 100644 --- a/llvm/test/YAMLParser/spec-09-05.test +++ b/llvm/test/YAMLParser/spec-09-05.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "first\n \t", -# CHECK-NEXT: !!str "first\n \tlast", -# CHECK-NEXT: !!str "first\n inner\n \tlast", +# CHECK-NEXT: !!str "first ", +# CHECK-NEXT: !!str "first\nlast", +# CHECK-NEXT: !!str "first inner \tlast", # CHECK-NEXT: ] - "first diff --git a/llvm/test/YAMLParser/spec-09-07.test b/llvm/test/YAMLParser/spec-09-07.test index 71007e79b79d..f397e2ca5f41 100644 --- a/llvm/test/YAMLParser/spec-09-07.test +++ b/llvm/test/YAMLParser/spec-09-07.test @@ -4,8 +4,8 @@ # CHECK-NEXT: : !!map { # CHECK-NEXT: ? !!str "also simple" # CHECK-NEXT: : !!str "value", -# CHECK-NEXT: ? !!str "not a\n simple key" -# CHECK-NEXT: : !!str "any\n value", +# CHECK-NEXT: ? 
!!str "not a simple key" +# CHECK-NEXT: : !!str "any value", # CHECK-NEXT: }, # CHECK-NEXT: } diff --git a/llvm/test/YAMLParser/spec-09-08.test b/llvm/test/YAMLParser/spec-09-08.test index 5d1f13b0e31d..7ed436ecb7ce 100644 --- a/llvm/test/YAMLParser/spec-09-08.test +++ b/llvm/test/YAMLParser/spec-09-08.test @@ -1,5 +1,11 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "as space\t\n trimmed \n \n specific\L\n none" +# CHECK: "as space trimmed\nspecific\L none" + +## Note: The parsing rules were changed in version 1.2 and the line-separator +## character is no longer considered a line-break character. The example is +## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as +## in the original edition. +## See https://yaml.org/spec/1.2.2/ext/changes/ for details. 'as space trimmed diff --git a/llvm/test/YAMLParser/spec-09-09.test b/llvm/test/YAMLParser/spec-09-09.test index 181971bd1349..4910b66c24b1 100644 --- a/llvm/test/YAMLParser/spec-09-09.test +++ b/llvm/test/YAMLParser/spec-09-09.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "\n last", -# CHECK-NEXT: !!str " \t\n last", -# CHECK-NEXT: !!str " \tfirst\n last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " \tfirst last", # CHECK-NEXT: ] - ' diff --git a/llvm/test/YAMLParser/spec-09-10.test b/llvm/test/YAMLParser/spec-09-10.test index f75834fa4dda..3e21afe22d34 100644 --- a/llvm/test/YAMLParser/spec-09-10.test +++ b/llvm/test/YAMLParser/spec-09-10.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "first\n \tinner\t\n last" +# CHECK: "first inner last" 'first inner diff --git a/llvm/test/YAMLParser/spec-09-11.test b/llvm/test/YAMLParser/spec-09-11.test index b1f8f45f954a..62bc1927998b 100644 --- a/llvm/test/YAMLParser/spec-09-11.test +++ b/llvm/test/YAMLParser/spec-09-11.test @@ -1,7 +1,7 @@ # RUN: 
yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "first\n \t", -# CHECK-NEXT: !!str "first\n\n \tlast", +# CHECK-NEXT: !!str "first ", +# CHECK-NEXT: !!str "first\nlast", # CHECK-NEXT: ] - 'first diff --git a/llvm/test/YAMLParser/spec-09-13.test b/llvm/test/YAMLParser/spec-09-13.test index 015f38951ebb..f2a5f49ea0c6 100644 --- a/llvm/test/YAMLParser/spec-09-13.test +++ b/llvm/test/YAMLParser/spec-09-13.test @@ -4,8 +4,8 @@ # CHECK-NEXT: : !!map { # CHECK-NEXT: ? !!str "also simple" # CHECK-NEXT: : !!str "value", -# CHECK-NEXT: ? !!str "not a\n simple key" -# CHECK-NEXT: : !!str "any\n value", +# CHECK-NEXT: ? !!str "not a simple key" +# CHECK-NEXT: : !!str "any value", # CHECK-NEXT: }, # CHECK-NEXT: } diff --git a/llvm/test/YAMLParser/spec-09-16.test b/llvm/test/YAMLParser/spec-09-16.test index b1f52ce194f1..b6c92e3ec63c 100644 --- a/llvm/test/YAMLParser/spec-09-16.test +++ b/llvm/test/YAMLParser/spec-09-16.test @@ -1,5 +1,11 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "as space\t\n trimmed \n\n specific\L\n none" +# CHECK: "as space trimmed\nspecific\L none" + +## Note: The parsing rules were changed in version 1.2 and the line-separator +## character is no longer considered a line-break character. The example is +## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as +## in the original edition. +## See https://yaml.org/spec/1.2.2/ext/changes/ for details. 
as space trimmed diff --git a/llvm/test/YAMLParser/spec-09-17.test b/llvm/test/YAMLParser/spec-09-17.test index 425925774d92..06f1db212027 100644 --- a/llvm/test/YAMLParser/spec-09-17.test +++ b/llvm/test/YAMLParser/spec-09-17.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "first line \n \n more line" +# CHECK: "first line\nmore line" first line diff --git a/llvm/test/YAMLParser/spec-10-02.test b/llvm/test/YAMLParser/spec-10-02.test index 9adddc9237d5..2fd91040af26 100644 --- a/llvm/test/YAMLParser/spec-10-02.test +++ b/llvm/test/YAMLParser/spec-10-02.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "double\n quoted", -# CHECK-NEXT: !!str "single\n quoted", -# CHECK-NEXT: !!str "plain\n text", +# CHECK-NEXT: !!str "double quoted", +# CHECK-NEXT: !!str "single quoted", +# CHECK-NEXT: !!str "plain text", # CHECK-NEXT: !!seq [ # CHECK-NEXT: !!str "nested", # CHECK-NEXT: ], diff --git a/llvm/test/YAMLParser/spec1.2-07-05.test b/llvm/test/YAMLParser/spec1.2-07-05.test index f923f68d0429..a273e79acef6 100644 --- a/llvm/test/YAMLParser/spec1.2-07-05.test +++ b/llvm/test/YAMLParser/spec1.2-07-05.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "folded \nto a space,\t\n \nto a line feed, or \t \tnon-content" +# CHECK: "folded to a space,\nto a line feed, or \t \tnon-content" "folded to a space, diff --git a/llvm/test/YAMLParser/spec1.2-07-06.test b/llvm/test/YAMLParser/spec1.2-07-06.test index 8982c1ed2a7b..7008bbcf1516 100644 --- a/llvm/test/YAMLParser/spec1.2-07-06.test +++ b/llvm/test/YAMLParser/spec1.2-07-06.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: " 1st non-empty\n 2nd non-empty \n\t3rd non-empty " +# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty " " 1st non-empty diff --git a/llvm/test/YAMLParser/spec1.2-07-09.test 
b/llvm/test/YAMLParser/spec1.2-07-09.test index 38d541973bc4..6a71f8c8ad89 100644 --- a/llvm/test/YAMLParser/spec1.2-07-09.test +++ b/llvm/test/YAMLParser/spec1.2-07-09.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: " 1st non-empty\n\n 2nd non-empty \n\t3rd non-empty " +# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty " ' 1st non-empty diff --git a/llvm/test/YAMLParser/spec1.2-07-12.test b/llvm/test/YAMLParser/spec1.2-07-12.test index 84d986e29d51..b5d0cb91f302 100644 --- a/llvm/test/YAMLParser/spec1.2-07-12.test +++ b/llvm/test/YAMLParser/spec1.2-07-12.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "1st non-empty\n\n 2nd non-empty \n\t3rd non-empty" +# CHECK: "1st non-empty\n2nd non-empty 3rd non-empty" 1st non-empty diff --git a/llvm/unittests/Support/YAMLParserTest.cpp b/llvm/unittests/Support/YAMLParserTest.cpp index 247e70756861..7bd11e748155 100644 --- a/llvm/unittests/Support/YAMLParserTest.cpp +++ b/llvm/unittests/Support/YAMLParserTest.cpp @@ -441,4 +441,106 @@ TEST(YAMLParser, ParsesBools) { expectCannotParseBool("0"); } +// Checks that the given string can be parsed into an expected scalar value. 
+static void expectCanParseScalar(StringRef Input, StringRef Expected) { + SourceMgr SM; + yaml::Stream Stream(Input, SM); + yaml::Node *Root = Stream.begin()->getRoot(); + ASSERT_NE(Root, nullptr); + auto *ScalarNode = dyn_cast<yaml::ScalarNode>(Root); + ASSERT_NE(ScalarNode, nullptr); + SmallVector<char> Storage; + StringRef Result = ScalarNode->getValue(Storage); + EXPECT_EQ(Result, Expected); +} + +TEST(YAMLParser, UnfoldsScalarValue) { + // Double-quoted values + expectCanParseScalar("\"\"", ""); + expectCanParseScalar("\" \t\t \t\t \"", " \t\t \t\t "); + expectCanParseScalar("\"\n\"", " "); + expectCanParseScalar("\"\r\"", " "); + expectCanParseScalar("\"\r\n\"", " "); + expectCanParseScalar("\"\n\n\"", "\n"); + expectCanParseScalar("\"\r\r\"", "\n"); + expectCanParseScalar("\"\n\r\"", "\n"); + expectCanParseScalar("\"\r\n\r\n\"", "\n"); + expectCanParseScalar("\"\n\n\n\"", "\n\n"); + expectCanParseScalar("\"\r\r\r\"", "\n\n"); + expectCanParseScalar("\"\r\n\r\n\r\n\"", "\n\n"); + expectCanParseScalar("\" \t \t \n\t \t \t\r \t \t \"", "\n"); + expectCanParseScalar("\" \t A \t \n \t B \t \"", " \t A B \t "); + expectCanParseScalar("\" \t \\ \r\r\t \\ \t \"", " \t \n \t "); + expectCanParseScalar("\"A\nB\"", "A B"); + expectCanParseScalar("\"A\rB\"", "A B"); + expectCanParseScalar("\"A\r\nB\"", "A B"); + expectCanParseScalar("\"A\n\nB\"", "A\nB"); + expectCanParseScalar("\"A\r\rB\"", "A\nB"); + expectCanParseScalar("\"A\n\rB\"", "A\nB"); + expectCanParseScalar("\"A\r\n\r\nB\"", "A\nB"); + expectCanParseScalar("\"A\n\n\nB\"", "A\n\nB"); + expectCanParseScalar("\"A\r\r\rB\"", "A\n\nB"); + expectCanParseScalar("\"A\r\n\r\n\r\nB\"", "A\n\nB"); + expectCanParseScalar("\"A \t \t \n\t \t \t B\"", "A B"); + expectCanParseScalar("\"A \t \t \n\t \t \t\r \t \t B\"", "A\nB"); + expectCanParseScalar("\"A \t \t \n\t \t \t\r\n \t \r \t B\"", "A\n\nB"); + expectCanParseScalar("\"A\\\rB\"", "AB"); + expectCanParseScalar("\"A\\\nB\"", "AB"); + expectCanParseScalar("\"A\\\r\nB\"", "AB"); +
expectCanParseScalar("\"A \t \\\rB\"", "A \t B"); + expectCanParseScalar("\"A \t\\\nB\"", "A \tB"); + expectCanParseScalar("\"A\t \\\r\nB\"", "A\t B"); + expectCanParseScalar("\"A\\\r\rB\"", "A B"); + expectCanParseScalar("\"A\\\n\nB\"", "A B"); + expectCanParseScalar("\"A\\\r\n\r\nB\"", "A B"); + expectCanParseScalar("\"A\\\r\r\rB\"", "A\nB"); + expectCanParseScalar("\"A\\\n\n\nB\"", "A\nB"); + expectCanParseScalar("\"A\\\r\n\r\n\r\nB\"", "A\nB"); + expectCanParseScalar("\"A\r\\ \rB\"", "A B"); + // Single-quoted values + expectCanParseScalar("''", ""); + expectCanParseScalar("' \t\t \t\t '", " \t\t \t\t "); + expectCanParseScalar("'\n'", " "); + expectCanParseScalar("'\r'", " "); + expectCanParseScalar("'\r\n'", " "); + expectCanParseScalar("'\n\n'", "\n"); + expectCanParseScalar("'\r\r'", "\n"); + expectCanParseScalar("'\n\r'", "\n"); + expectCanParseScalar("'\r\n\r\n'", "\n"); + expectCanParseScalar("'\n\n\n'", "\n\n"); + expectCanParseScalar("'\r\r\r'", "\n\n"); + expectCanParseScalar("'\r\n\r\n\r\n'", "\n\n"); + expectCanParseScalar("' \t \t \n\t \t \t\r \t \t '", "\n"); + expectCanParseScalar("' \t A \t \n \t B \t '", " \t A B \t "); + expectCanParseScalar("'A\nB'", "A B"); + expectCanParseScalar("'A\rB'", "A B"); + expectCanParseScalar("'A\r\nB'", "A B"); + expectCanParseScalar("'A\n\nB'", "A\nB"); + expectCanParseScalar("'A\r\rB'", "A\nB"); + expectCanParseScalar("'A\n\rB'", "A\nB"); + expectCanParseScalar("'A\r\n\r\nB'", "A\nB"); + expectCanParseScalar("'A\n\n\nB'", "A\n\nB"); + expectCanParseScalar("'A\r\r\rB'", "A\n\nB"); + expectCanParseScalar("'A\r\n\r\n\r\nB'", "A\n\nB"); + expectCanParseScalar("'A \t \t \n\t \t \t B'", "A B"); + expectCanParseScalar("'A \t \t \n\t \t \t\r \t \t B'", "A\nB"); + expectCanParseScalar("'A \t \t \n\t \t \t\r\n \t \r \t B'", "A\n\nB"); + // Plain values + expectCanParseScalar("A \t \r \n \t \r\n \t\r\r\t ", "A"); + expectCanParseScalar("A \t \n \t B", "A B"); + expectCanParseScalar("A\nB", "A B"); + 
expectCanParseScalar("A\rB", "A B"); + expectCanParseScalar("A\r\nB", "A B"); + expectCanParseScalar("A\n\nB", "A\nB"); + expectCanParseScalar("A\r\rB", "A\nB"); + expectCanParseScalar("A\n\rB", "A\nB"); + expectCanParseScalar("A\r\n\r\nB", "A\nB"); + expectCanParseScalar("A\n\n\nB", "A\n\nB"); + expectCanParseScalar("A\r\r\rB", "A\n\nB"); + expectCanParseScalar("A\r\n\r\n\r\nB", "A\n\nB"); + expectCanParseScalar("A \t \t \n\t \t \t B", "A B"); + expectCanParseScalar("A \t \t \n\t \t \t\r \t \t B", "A\nB"); + expectCanParseScalar("A \t \t \n\t \t \t\r\n \t \r \t B", "A\n\nB"); +} + } // end namespace llvm