[YAMLParser] Unfold multi-line scalar values (#70898)
Long scalar values can be split into multiple lines to improve readability. The rules are described in Section 6.5. "Line Folding", https://yaml.org/spec/1.2.2/#65-line-folding. In addition, for flow scalar styles, the Spec states that "All leading and trailing white space characters on each line are excluded from the content", https://yaml.org/spec/1.2.2/#73-flow-scalar-styles. The patch implements these unfolding rules for double-quoted, single-quoted, and plain scalars.
This commit is contained in:
@@ -240,9 +240,14 @@ public:
|
||||
private:
|
||||
StringRef Value;
|
||||
|
||||
StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
|
||||
StringRef::size_type Start,
|
||||
StringRef getDoubleQuotedValue(StringRef UnquotedValue,
|
||||
SmallVectorImpl<char> &Storage) const;
|
||||
|
||||
static StringRef getSingleQuotedValue(StringRef RawValue,
|
||||
SmallVectorImpl<char> &Storage);
|
||||
|
||||
static StringRef getPlainValue(StringRef RawValue,
|
||||
SmallVectorImpl<char> &Storage);
|
||||
};
|
||||
|
||||
/// A block scalar node is an opaque datum that can be presented as a
|
||||
|
||||
@@ -2030,186 +2030,231 @@ bool Node::failed() const {
|
||||
}
|
||||
|
||||
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
|
||||
// TODO: Handle newlines properly. We need to remove leading whitespace.
|
||||
if (Value[0] == '"') { // Double quoted.
|
||||
// Pull off the leading and trailing "s.
|
||||
StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
|
||||
// Search for characters that would require unescaping the value.
|
||||
StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
|
||||
if (i != StringRef::npos)
|
||||
return unescapeDoubleQuoted(UnquotedValue, i, Storage);
|
||||
return UnquotedValue;
|
||||
} else if (Value[0] == '\'') { // Single quoted.
|
||||
// Pull off the leading and trailing 's.
|
||||
StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
|
||||
StringRef::size_type i = UnquotedValue.find('\'');
|
||||
if (i != StringRef::npos) {
|
||||
// We're going to need Storage.
|
||||
Storage.clear();
|
||||
Storage.reserve(UnquotedValue.size());
|
||||
for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
|
||||
StringRef Valid(UnquotedValue.begin(), i);
|
||||
llvm::append_range(Storage, Valid);
|
||||
Storage.push_back('\'');
|
||||
UnquotedValue = UnquotedValue.substr(i + 2);
|
||||
}
|
||||
llvm::append_range(Storage, UnquotedValue);
|
||||
return StringRef(Storage.begin(), Storage.size());
|
||||
}
|
||||
return UnquotedValue;
|
||||
}
|
||||
// Plain.
|
||||
// Trim whitespace ('b-char' and 's-white').
|
||||
// NOTE: Alternatively we could change the scanner to not include whitespace
|
||||
// here in the first place.
|
||||
return Value.rtrim("\x0A\x0D\x20\x09");
|
||||
if (Value[0] == '"')
|
||||
return getDoubleQuotedValue(Value, Storage);
|
||||
if (Value[0] == '\'')
|
||||
return getSingleQuotedValue(Value, Storage);
|
||||
return getPlainValue(Value, Storage);
|
||||
}
|
||||
|
||||
StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
|
||||
, StringRef::size_type i
|
||||
, SmallVectorImpl<char> &Storage)
|
||||
const {
|
||||
// Use Storage to build proper value.
|
||||
/// parseScalarValue - A common parsing routine for all flow scalar styles.
|
||||
/// It handles line break characters by itself, adds regular content characters
|
||||
/// to the result, and forwards escaped sequences to the provided routine for
|
||||
/// the style-specific processing.
|
||||
///
|
||||
/// \param UnquotedValue - An input value without quotation marks.
|
||||
/// \param Storage - A storage for the result if the input value is multiline or
|
||||
/// contains escaped characters.
|
||||
/// \param LookupChars - A set of special characters to search in the input
|
||||
/// string. Should include line break characters and the escape character
|
||||
/// specific for the processing scalar style, if any.
|
||||
/// \param UnescapeCallback - This is called when the escape character is found
|
||||
/// in the input.
|
||||
/// \returns - The unfolded and unescaped value.
|
||||
static StringRef
|
||||
parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage,
|
||||
StringRef LookupChars,
|
||||
std::function<StringRef(StringRef, SmallVectorImpl<char> &)>
|
||||
UnescapeCallback) {
|
||||
size_t I = UnquotedValue.find_first_of(LookupChars);
|
||||
if (I == StringRef::npos)
|
||||
return UnquotedValue;
|
||||
|
||||
Storage.clear();
|
||||
Storage.reserve(UnquotedValue.size());
|
||||
for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
|
||||
// Insert all previous chars into Storage.
|
||||
StringRef Valid(UnquotedValue.begin(), i);
|
||||
llvm::append_range(Storage, Valid);
|
||||
// Chop off inserted chars.
|
||||
UnquotedValue = UnquotedValue.substr(i);
|
||||
|
||||
assert(!UnquotedValue.empty() && "Can't be empty!");
|
||||
|
||||
// Parse escape or line break.
|
||||
switch (UnquotedValue[0]) {
|
||||
case '\r':
|
||||
case '\n':
|
||||
Storage.push_back('\n');
|
||||
if ( UnquotedValue.size() > 1
|
||||
&& (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
|
||||
UnquotedValue = UnquotedValue.substr(1);
|
||||
UnquotedValue = UnquotedValue.substr(1);
|
||||
break;
|
||||
default:
|
||||
if (UnquotedValue.size() == 1) {
|
||||
Token T;
|
||||
T.Range = StringRef(UnquotedValue.begin(), 1);
|
||||
setError("Unrecognized escape code", T);
|
||||
return "";
|
||||
}
|
||||
UnquotedValue = UnquotedValue.substr(1);
|
||||
switch (UnquotedValue[0]) {
|
||||
default: {
|
||||
Token T;
|
||||
T.Range = StringRef(UnquotedValue.begin(), 1);
|
||||
setError("Unrecognized escape code", T);
|
||||
return "";
|
||||
}
|
||||
case '\r':
|
||||
// Shrink the Windows-style EOL.
|
||||
if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n')
|
||||
UnquotedValue = UnquotedValue.drop_front(1);
|
||||
[[fallthrough]];
|
||||
case '\n':
|
||||
UnquotedValue = UnquotedValue.drop_front(1).ltrim(" \t");
|
||||
continue;
|
||||
case '0':
|
||||
Storage.push_back(0x00);
|
||||
break;
|
||||
case 'a':
|
||||
Storage.push_back(0x07);
|
||||
break;
|
||||
case 'b':
|
||||
Storage.push_back(0x08);
|
||||
break;
|
||||
case 't':
|
||||
case 0x09:
|
||||
Storage.push_back(0x09);
|
||||
break;
|
||||
case 'n':
|
||||
Storage.push_back(0x0A);
|
||||
break;
|
||||
case 'v':
|
||||
Storage.push_back(0x0B);
|
||||
break;
|
||||
case 'f':
|
||||
Storage.push_back(0x0C);
|
||||
break;
|
||||
case 'r':
|
||||
Storage.push_back(0x0D);
|
||||
break;
|
||||
case 'e':
|
||||
Storage.push_back(0x1B);
|
||||
break;
|
||||
case ' ':
|
||||
Storage.push_back(0x20);
|
||||
break;
|
||||
case '"':
|
||||
Storage.push_back(0x22);
|
||||
break;
|
||||
case '/':
|
||||
Storage.push_back(0x2F);
|
||||
break;
|
||||
case '\\':
|
||||
Storage.push_back(0x5C);
|
||||
break;
|
||||
case 'N':
|
||||
encodeUTF8(0x85, Storage);
|
||||
break;
|
||||
case '_':
|
||||
encodeUTF8(0xA0, Storage);
|
||||
break;
|
||||
case 'L':
|
||||
encodeUTF8(0x2028, Storage);
|
||||
break;
|
||||
case 'P':
|
||||
encodeUTF8(0x2029, Storage);
|
||||
break;
|
||||
case 'x': {
|
||||
if (UnquotedValue.size() < 3)
|
||||
// TODO: Report error.
|
||||
break;
|
||||
unsigned int UnicodeScalarValue;
|
||||
if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
|
||||
// TODO: Report error.
|
||||
UnicodeScalarValue = 0xFFFD;
|
||||
encodeUTF8(UnicodeScalarValue, Storage);
|
||||
UnquotedValue = UnquotedValue.substr(2);
|
||||
break;
|
||||
}
|
||||
case 'u': {
|
||||
if (UnquotedValue.size() < 5)
|
||||
// TODO: Report error.
|
||||
break;
|
||||
unsigned int UnicodeScalarValue;
|
||||
if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
|
||||
// TODO: Report error.
|
||||
UnicodeScalarValue = 0xFFFD;
|
||||
encodeUTF8(UnicodeScalarValue, Storage);
|
||||
UnquotedValue = UnquotedValue.substr(4);
|
||||
break;
|
||||
}
|
||||
case 'U': {
|
||||
if (UnquotedValue.size() < 9)
|
||||
// TODO: Report error.
|
||||
break;
|
||||
unsigned int UnicodeScalarValue;
|
||||
if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
|
||||
// TODO: Report error.
|
||||
UnicodeScalarValue = 0xFFFD;
|
||||
encodeUTF8(UnicodeScalarValue, Storage);
|
||||
UnquotedValue = UnquotedValue.substr(8);
|
||||
break;
|
||||
}
|
||||
}
|
||||
UnquotedValue = UnquotedValue.substr(1);
|
||||
char LastNewLineAddedAs = '\0';
|
||||
for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) {
|
||||
if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') {
|
||||
llvm::append_range(Storage, UnquotedValue.take_front(I));
|
||||
UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage);
|
||||
LastNewLineAddedAs = '\0';
|
||||
continue;
|
||||
}
|
||||
if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I);
|
||||
LastNonSWhite != StringRef::npos) {
|
||||
llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1));
|
||||
Storage.push_back(' ');
|
||||
LastNewLineAddedAs = ' ';
|
||||
} else {
|
||||
// Note: we can't just check if the last character in Storage is ' ',
|
||||
// '\n', or something else; that would give a wrong result for double
|
||||
// quoted values containing an escaped space character before a new-line
|
||||
// character.
|
||||
switch (LastNewLineAddedAs) {
|
||||
case ' ':
|
||||
assert(!Storage.empty() && Storage.back() == ' ');
|
||||
Storage.back() = '\n';
|
||||
LastNewLineAddedAs = '\n';
|
||||
break;
|
||||
case '\n':
|
||||
assert(!Storage.empty() && Storage.back() == '\n');
|
||||
Storage.push_back('\n');
|
||||
break;
|
||||
default:
|
||||
Storage.push_back(' ');
|
||||
LastNewLineAddedAs = ' ';
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Handle Windows-style EOL
|
||||
if (UnquotedValue.substr(I, 2) == "\r\n")
|
||||
I++;
|
||||
UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t");
|
||||
}
|
||||
llvm::append_range(Storage, UnquotedValue);
|
||||
return StringRef(Storage.begin(), Storage.size());
|
||||
}
|
||||
|
||||
/// Produces the content of a double-quoted scalar.
///
/// The enclosing quotation marks are removed and the remaining text is handed
/// to parseScalarValue() together with a callback that decodes one backslash
/// escape sequence at a time.
///
/// \param RawValue - The scalar text, including both quotation marks.
/// \param Storage - Forwarded to parseScalarValue() as the result buffer.
/// \returns - The value produced by parseScalarValue() for the unquoted text.
StringRef
ScalarNode::getDoubleQuotedValue(StringRef RawValue,
                                 SmallVectorImpl<char> &Storage) const {
  assert(RawValue.size() >= 2 && RawValue.front() == '"' &&
         RawValue.back() == '"');

  // Escape codes that expand to exactly one byte.
  struct ByteEscape {
    char Code;
    char Byte;
  };
  static const ByteEscape ByteEscapes[] = {
      {'0', 0x00}, {'a', 0x07}, {'b', 0x08}, {'t', 0x09}, {'\t', 0x09},
      {'n', 0x0A}, {'v', 0x0B}, {'f', 0x0C}, {'r', 0x0D}, {'e', 0x1B},
      {' ', 0x20}, {'"', 0x22}, {'/', 0x2F}, {'\\', 0x5C}};

  // Escape codes that expand to a code point emitted through encodeUTF8().
  struct WideEscape {
    char Code;
    unsigned CodePoint;
  };
  static const WideEscape WideEscapes[] = {
      {'N', 0x85}, {'_', 0xA0}, {'L', 0x2028}, {'P', 0x2029}};

  // Decodes the escape sequence at the front of Rest (which starts with the
  // backslash), appends the decoded bytes to Out, and returns the text that
  // follows the sequence.
  auto DecodeEscape = [this](StringRef Rest,
                             SmallVectorImpl<char> &Out) -> StringRef {
    assert(Rest.take_front(1) == "\\");

    // Reports Offending as an unrecognized escape, drops everything collected
    // so far, and yields an empty remainder.
    auto Reject = [this, &Out](StringRef Offending) -> StringRef {
      Token T;
      T.Range = Offending;
      setError("Unrecognized escape code", T);
      Out.clear();
      return StringRef();
    };

    // A lone backslash with nothing after it.
    if (Rest.size() == 1)
      return Reject(Rest);

    Rest = Rest.drop_front(1);
    const char Code = Rest[0];

    // Escaped line break: nothing is emitted, and the blanks that start the
    // next line are skipped as well.
    if (Code == '\r' || Code == '\n') {
      // Shrink the Windows-style EOL.
      if (Code == '\r' && Rest.size() >= 2 && Rest[1] == '\n')
        Rest = Rest.drop_front(1);
      return Rest.drop_front(1).ltrim(" \t");
    }

    // Numeric escapes: \xHH, \uHHHH and \UHHHHHHHH.
    const size_t NumHexDigits =
        Code == 'x' ? 2 : Code == 'u' ? 4 : Code == 'U' ? 8 : 0;
    if (NumHexDigits != 0) {
      // TODO: Report error.
      // Too few characters left: only the escape letter is consumed.
      if (Rest.size() < NumHexDigits + 1)
        return Rest.drop_front(1);
      unsigned int UnicodeScalarValue;
      // TODO: Report error.
      if (Rest.substr(1, NumHexDigits).getAsInteger(16, UnicodeScalarValue))
        UnicodeScalarValue = 0xFFFD;
      encodeUTF8(UnicodeScalarValue, Out);
      return Rest.drop_front(NumHexDigits + 1);
    }

    for (const ByteEscape &E : ByteEscapes) {
      if (E.Code == Code) {
        Out.push_back(E.Byte);
        return Rest.drop_front(1);
      }
    }

    for (const WideEscape &E : WideEscapes) {
      if (E.Code == Code) {
        encodeUTF8(E.CodePoint, Out);
        return Rest.drop_front(1);
      }
    }

    return Reject(Rest.take_front(1));
  };

  return parseScalarValue(RawValue.substr(1, RawValue.size() - 2), Storage,
                          "\\\r\n", DecodeEscape);
}
|
||||
|
||||
/// Produces the content of a single-quoted scalar.
///
/// The enclosing quotation marks are removed and the remaining text is handed
/// to parseScalarValue(). The callback passed along turns a doubled single
/// quote into one quote character.
///
/// \param RawValue - The scalar text, including both quotation marks.
/// \param Storage - Forwarded to parseScalarValue() as the result buffer.
/// \returns - The value produced by parseScalarValue() for the unquoted text.
StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue,
                                           SmallVectorImpl<char> &Storage) {
  assert(RawValue.size() >= 2 && RawValue.front() == '\'' &&
         RawValue.back() == '\'');

  // Consumes the two quote characters at the front of Rest and emits one.
  auto CollapseQuotePair = [](StringRef Rest, SmallVectorImpl<char> &Out) {
    assert(Rest.take_front(2) == "''");
    Out.push_back('\'');
    return Rest.drop_front(2);
  };

  const StringRef Inner = RawValue.substr(1, RawValue.size() - 2);
  return parseScalarValue(Inner, Storage, "'\r\n", CollapseQuotePair);
}
|
||||
|
||||
/// Produces the content of a plain (unquoted) scalar.
///
/// \param RawValue - The scalar text as stored in the node.
/// \param Storage - Forwarded to parseScalarValue() as the result buffer.
/// \returns - The value produced by parseScalarValue() for the trimmed text.
StringRef ScalarNode::getPlainValue(StringRef RawValue,
                                    SmallVectorImpl<char> &Storage) {
  // The raw text may still carry trailing 'b-char' and 's-white' characters;
  // strip them before parsing.
  // NOTE: Alternatively we could change the scanner to not include whitespace
  // here in the first place.
  const StringRef Trimmed = RawValue.rtrim("\r\n \t");

  // Only line breaks are looked up and no callback is supplied.
  return parseScalarValue(Trimmed, Storage, "\r\n", nullptr);
}
|
||||
|
||||
Node *KeyValueNode::getKey() {
|
||||
if (Key)
|
||||
return Key;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "Text containing \n both space and\t\n \ttab\tcharacters"
|
||||
# CHECK: "Text containing both space and tab\tcharacters"
|
||||
|
||||
"Text containing
|
||||
both space and
|
||||
|
||||
@@ -6,4 +6,4 @@
|
||||
\ \_ \N \L \P \
|
||||
\x41 \u0041 \U00000041"
|
||||
|
||||
# CHECK: !!str "Fun with \\\n\" \a \b \e \f \n \r \t \v \0 \_ \N \L \P A A A"
|
||||
# CHECK: !!str "Fun with \\ \" \a \b \e \f \n \r \t \v \0 \_ \N \L \P A A A"
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
# CHECK-NEXT: : !!map {
|
||||
# CHECK-NEXT: ? !!str "also simple"
|
||||
# CHECK-NEXT: : !!str "value",
|
||||
# CHECK-NEXT: ? !!str "not a\n simple key"
|
||||
# CHECK-NEXT: : !!str "any\n value",
|
||||
# CHECK-NEXT: ? !!str "not a simple key"
|
||||
# CHECK-NEXT: : !!str "any value",
|
||||
# CHECK-NEXT: },
|
||||
# CHECK-NEXT: }
|
||||
|
||||
|
||||
@@ -1,12 +1,24 @@
|
||||
# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s --strict-whitespace
|
||||
# CHECK: "as space\n trimmed \n specific\L\n escaped\t\n none"
|
||||
# CHECK: "as space trimmed\nspecific\L escaped\t none"
|
||||
|
||||
## Note: The example was originally taken from Spec 1.1, but the parsing rules
|
||||
## have been changed since then.
|
||||
## * The paragraph-separator character '\u2029' is excluded from line-break
|
||||
## * The line-separator character '\u2028' is no longer considered a line-break
|
||||
## character, so the line "...specific\u2028\nescaped..." is now parsed as
|
||||
## "...specific\L escaped...".
|
||||
## * The paragraph-separator character '\u2029' is also excluded from line-break
|
||||
## characters, so the original sequence "escaped\t\\\u2029" is no longer
|
||||
## considered valid. This is replaced by "escaped\t\\\n" in the test source.
|
||||
## considered valid. This is replaced by "escaped\t\\\n" in the test source,
|
||||
## so the output has changed as well.
|
||||
## See https://yaml.org/spec/1.2.2/ext/changes/ for details.
|
||||
##
|
||||
## Note 2: Different parsers handle this corner case example differently.
|
||||
## * https://github.com/yaml/libyaml:
|
||||
## "as space trimmed\nspecific\L\nescaped\t\nnone"
|
||||
## * https://github.com/yaml/yaml-reference-parser (parser-1.2):
|
||||
## "as space trimmed\nspecific\L escaped\t none"
|
||||
## * https://github.com/yaml/yaml-reference-parser (parser-1.3):
|
||||
## "as space trimmed\nspecific\u2028escaped\t none"
|
||||
|
||||
"as space
|
||||
trimmed
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: !!seq [
|
||||
# CHECK-NEXT: !!str "\n last",
|
||||
# CHECK-NEXT: !!str " \t\n last",
|
||||
# CHECK-NEXT: !!str " \tfirst\n last",
|
||||
# CHECK-NEXT: !!str " last",
|
||||
# CHECK-NEXT: !!str " last",
|
||||
# CHECK-NEXT: !!str " \tfirst last",
|
||||
# CHECK-NEXT: ]
|
||||
|
||||
- "
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "first\n \tinner 1\t\n inner 2 last"
|
||||
# CHECK: "first inner 1 inner 2 last"
|
||||
|
||||
"first
|
||||
inner 1
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: !!seq [
|
||||
# CHECK-NEXT: !!str "first\n \t",
|
||||
# CHECK-NEXT: !!str "first\n \tlast",
|
||||
# CHECK-NEXT: !!str "first\n inner\n \tlast",
|
||||
# CHECK-NEXT: !!str "first ",
|
||||
# CHECK-NEXT: !!str "first\nlast",
|
||||
# CHECK-NEXT: !!str "first inner \tlast",
|
||||
# CHECK-NEXT: ]
|
||||
|
||||
- "first
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
# CHECK-NEXT: : !!map {
|
||||
# CHECK-NEXT: ? !!str "also simple"
|
||||
# CHECK-NEXT: : !!str "value",
|
||||
# CHECK-NEXT: ? !!str "not a\n simple key"
|
||||
# CHECK-NEXT: : !!str "any\n value",
|
||||
# CHECK-NEXT: ? !!str "not a simple key"
|
||||
# CHECK-NEXT: : !!str "any value",
|
||||
# CHECK-NEXT: },
|
||||
# CHECK-NEXT: }
|
||||
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "as space\t\n trimmed \n \n specific\L\n none"
|
||||
# CHECK: "as space trimmed\nspecific\L none"
|
||||
|
||||
## Note: The parsing rules were changed in version 1.2 and the line-separator
|
||||
## character is no longer considered a line-break character. The example is
|
||||
## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as
|
||||
## in the original edition.
|
||||
## See https://yaml.org/spec/1.2.2/ext/changes/ for details.
|
||||
|
||||
'as space
|
||||
trimmed
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: !!seq [
|
||||
# CHECK-NEXT: !!str "\n last",
|
||||
# CHECK-NEXT: !!str " \t\n last",
|
||||
# CHECK-NEXT: !!str " \tfirst\n last",
|
||||
# CHECK-NEXT: !!str " last",
|
||||
# CHECK-NEXT: !!str " last",
|
||||
# CHECK-NEXT: !!str " \tfirst last",
|
||||
# CHECK-NEXT: ]
|
||||
|
||||
- '
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "first\n \tinner\t\n last"
|
||||
# CHECK: "first inner last"
|
||||
|
||||
'first
|
||||
inner
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: !!seq [
|
||||
# CHECK-NEXT: !!str "first\n \t",
|
||||
# CHECK-NEXT: !!str "first\n\n \tlast",
|
||||
# CHECK-NEXT: !!str "first ",
|
||||
# CHECK-NEXT: !!str "first\nlast",
|
||||
# CHECK-NEXT: ]
|
||||
|
||||
- 'first
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
# CHECK-NEXT: : !!map {
|
||||
# CHECK-NEXT: ? !!str "also simple"
|
||||
# CHECK-NEXT: : !!str "value",
|
||||
# CHECK-NEXT: ? !!str "not a\n simple key"
|
||||
# CHECK-NEXT: : !!str "any\n value",
|
||||
# CHECK-NEXT: ? !!str "not a simple key"
|
||||
# CHECK-NEXT: : !!str "any value",
|
||||
# CHECK-NEXT: },
|
||||
# CHECK-NEXT: }
|
||||
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "as space\t\n trimmed \n\n specific\L\n none"
|
||||
# CHECK: "as space trimmed\nspecific\L none"
|
||||
|
||||
## Note: The parsing rules were changed in version 1.2 and the line-separator
|
||||
## character is no longer considered a line-break character. The example is
|
||||
## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as
|
||||
## in the original edition.
|
||||
## See https://yaml.org/spec/1.2.2/ext/changes/ for details.
|
||||
|
||||
as space
|
||||
trimmed
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "first line \n \n more line"
|
||||
# CHECK: "first line\nmore line"
|
||||
|
||||
first line
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: !!seq [
|
||||
# CHECK-NEXT: !!str "double\n quoted",
|
||||
# CHECK-NEXT: !!str "single\n quoted",
|
||||
# CHECK-NEXT: !!str "plain\n text",
|
||||
# CHECK-NEXT: !!str "double quoted",
|
||||
# CHECK-NEXT: !!str "single quoted",
|
||||
# CHECK-NEXT: !!str "plain text",
|
||||
# CHECK-NEXT: !!seq [
|
||||
# CHECK-NEXT: !!str "nested",
|
||||
# CHECK-NEXT: ],
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "folded \nto a space,\t\n \nto a line feed, or \t \tnon-content"
|
||||
# CHECK: "folded to a space,\nto a line feed, or \t \tnon-content"
|
||||
|
||||
"folded
|
||||
to a space,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: " 1st non-empty\n 2nd non-empty \n\t3rd non-empty "
|
||||
# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty "
|
||||
|
||||
" 1st non-empty
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: " 1st non-empty\n\n 2nd non-empty \n\t3rd non-empty "
|
||||
# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty "
|
||||
|
||||
' 1st non-empty
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace
|
||||
# CHECK: "1st non-empty\n\n 2nd non-empty \n\t3rd non-empty"
|
||||
# CHECK: "1st non-empty\n2nd non-empty 3rd non-empty"
|
||||
|
||||
1st non-empty
|
||||
|
||||
|
||||
@@ -441,4 +441,106 @@ TEST(YAMLParser, ParsesBools) {
|
||||
expectCannotParseBool("0");
|
||||
}
|
||||
|
||||
// Checks that the given string can be parsed into an expected scalar value.
|
||||
static void expectCanParseScalar(StringRef Input, StringRef Expected) {
|
||||
SourceMgr SM;
|
||||
yaml::Stream Stream(Input, SM);
|
||||
yaml::Node *Root = Stream.begin()->getRoot();
|
||||
ASSERT_NE(Root, nullptr);
|
||||
auto *ScalarNode = dyn_cast<yaml::ScalarNode>(Root);
|
||||
ASSERT_NE(ScalarNode, nullptr);
|
||||
SmallVector<char> Storage;
|
||||
StringRef Result = ScalarNode->getValue(Storage);
|
||||
EXPECT_EQ(Result, Expected);
|
||||
}
|
||||
|
||||
// Feeds a table of double-quoted, single-quoted, and plain scalars through
// expectCanParseScalar() and checks each against its expected value.
TEST(YAMLParser, UnfoldsScalarValue) {
  struct FoldingCase {
    const char *Input;
    const char *Expected;
  };

  static const FoldingCase Cases[] = {
      // Double-quoted values
      {"\"\"", ""},
      {"\" \t\t \t\t \"", " \t\t \t\t "},
      {"\"\n\"", " "},
      {"\"\r\"", " "},
      {"\"\r\n\"", " "},
      {"\"\n\n\"", "\n"},
      {"\"\r\r\"", "\n"},
      {"\"\n\r\"", "\n"},
      {"\"\r\n\r\n\"", "\n"},
      {"\"\n\n\n\"", "\n\n"},
      {"\"\r\r\r\"", "\n\n"},
      {"\"\r\n\r\n\r\n\"", "\n\n"},
      {"\" \t \t \n\t \t \t\r \t \t \"", "\n"},
      {"\" \t A \t \n \t B \t \"", " \t A B \t "},
      {"\" \t \\ \r\r\t \\ \t \"", " \t \n \t "},
      {"\"A\nB\"", "A B"},
      {"\"A\rB\"", "A B"},
      {"\"A\r\nB\"", "A B"},
      {"\"A\n\nB\"", "A\nB"},
      {"\"A\r\rB\"", "A\nB"},
      {"\"A\n\rB\"", "A\nB"},
      {"\"A\r\n\r\nB\"", "A\nB"},
      {"\"A\n\n\nB\"", "A\n\nB"},
      {"\"A\r\r\rB\"", "A\n\nB"},
      {"\"A\r\n\r\n\r\nB\"", "A\n\nB"},
      {"\"A \t \t \n\t \t \t B\"", "A B"},
      {"\"A \t \t \n\t \t \t\r \t \t B\"", "A\nB"},
      {"\"A \t \t \n\t \t \t\r\n \t \r \t B\"", "A\n\nB"},
      {"\"A\\\rB\"", "AB"},
      {"\"A\\\nB\"", "AB"},
      {"\"A\\\r\nB\"", "AB"},
      {"\"A \t \\\rB\"", "A \t B"},
      {"\"A \t\\\nB\"", "A \tB"},
      {"\"A\t \\\r\nB\"", "A\t B"},
      {"\"A\\\r\rB\"", "A B"},
      {"\"A\\\n\nB\"", "A B"},
      {"\"A\\\r\n\r\nB\"", "A B"},
      {"\"A\\\r\r\rB\"", "A\nB"},
      {"\"A\\\n\n\nB\"", "A\nB"},
      {"\"A\\\r\n\r\n\r\nB\"", "A\nB"},
      {"\"A\r\\ \rB\"", "A B"},
      // Single-quoted values
      {"''", ""},
      {"' \t\t \t\t '", " \t\t \t\t "},
      {"'\n'", " "},
      {"'\r'", " "},
      {"'\r\n'", " "},
      {"'\n\n'", "\n"},
      {"'\r\r'", "\n"},
      {"'\n\r'", "\n"},
      {"'\r\n\r\n'", "\n"},
      {"'\n\n\n'", "\n\n"},
      {"'\r\r\r'", "\n\n"},
      {"'\r\n\r\n\r\n'", "\n\n"},
      {"' \t \t \n\t \t \t\r \t \t '", "\n"},
      {"' \t A \t \n \t B \t '", " \t A B \t "},
      {"'A\nB'", "A B"},
      {"'A\rB'", "A B"},
      {"'A\r\nB'", "A B"},
      {"'A\n\nB'", "A\nB"},
      {"'A\r\rB'", "A\nB"},
      {"'A\n\rB'", "A\nB"},
      {"'A\r\n\r\nB'", "A\nB"},
      {"'A\n\n\nB'", "A\n\nB"},
      {"'A\r\r\rB'", "A\n\nB"},
      {"'A\r\n\r\n\r\nB'", "A\n\nB"},
      {"'A \t \t \n\t \t \t B'", "A B"},
      {"'A \t \t \n\t \t \t\r \t \t B'", "A\nB"},
      {"'A \t \t \n\t \t \t\r\n \t \r \t B'", "A\n\nB"},
      // Plain values
      {"A \t \r \n \t \r\n \t\r\r\t ", "A"},
      {"A \t \n \t B", "A B"},
      {"A\nB", "A B"},
      {"A\rB", "A B"},
      {"A\r\nB", "A B"},
      {"A\n\nB", "A\nB"},
      {"A\r\rB", "A\nB"},
      {"A\n\rB", "A\nB"},
      {"A\r\n\r\nB", "A\nB"},
      {"A\n\n\nB", "A\n\nB"},
      {"A\r\r\rB", "A\n\nB"},
      {"A\r\n\r\n\r\nB", "A\n\nB"},
      {"A \t \t \n\t \t \t B", "A B"},
      {"A \t \t \n\t \t \t\r \t \t B", "A\nB"},
      {"A \t \t \n\t \t \t\r\n \t \r \t B", "A\n\nB"},
  };

  // Cases run in table order, one helper call per entry.
  for (const FoldingCase &Case : Cases)
    expectCanParseScalar(Case.Input, Case.Expected);
}
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
Reference in New Issue
Block a user