[pseudo] Start rules are `_ := start-symbol EOF`, improve recovery.

Previously we called glrRecover() ad hoc at the end of input.
This had two main problems:
 - glrRecover() on two separate code paths is inelegant
 - We may have to recover several times in succession (e.g. to exit from
   nested scopes), so we need a loop at end-of-file
Having an actual shift action for an EOF terminal allows us to handle
both concerns in the main shift/recover/reduce loop.
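
The sketch below is a toy illustration of this point, not clang-pseudo code:
shiftable(), recover(), and the State type are hypothetical stand-ins. With an
explicit eof token at the end of the stream, "stuck at end of input" is just
another failed shift, so one recovery loop covers mid-input failures and
repeated recovery at end-of-file (e.g. closing several nested scopes).

  // Toy example only; hypothetical helpers, not the clang-pseudo API.
  #include <cstdio>
  #include <vector>

  enum Tok { Identifier, LBrace, RBrace, Eof };

  // Pretend parser state: how many scopes are still open.
  struct State { int OpenScopes = 0; };

  // A shift succeeds unless we reach eof with scopes still open.
  bool shiftable(const State &S, Tok T) {
    return !(T == Eof && S.OpenScopes > 0);
  }

  // One recovery step: synthesize a placeholder that closes one scope.
  bool recover(State &S) {
    if (S.OpenScopes == 0)
      return false;
    --S.OpenScopes;
    std::puts("recover: closed one scope with a placeholder");
    return true;
  }

  int main() {
    // Token stream with eof appended, as createTerminals() now does.
    std::vector<Tok> Terminals = {Identifier, LBrace, LBrace, Eof};
    State S;
    for (Tok T : Terminals) {
      // One loop for mid-input and end-of-input: recover until we can shift.
      while (!shiftable(S, T)) {
        if (!recover(S)) {
          std::puts("no recovery possible: give up");
          return 1;
        }
      }
      if (T == LBrace)
        ++S.OpenScopes;
      else if (T == RBrace)
        --S.OpenScopes;
    }
    std::puts("eof shifted: parse accepted");
  }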

This revealed a design bug in recovery: it could enter a loop by repeatedly
choosing the same parent node and recovering from it in the same way.
Addressed this by allowing each node to be used as a recovery base only once.
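
A minimal sketch of that fix follows; Node and recoverOnce below are
illustrative stand-ins, not the real GSS types. Marking each candidate base
node the first time it is used means a chain of recoveries can consume each
head at most once, so it must terminate.

  // Toy example of "use each node as a recovery base once"; Node and
  // recoverOnce are hypothetical, loosely mirroring GSS::Node::Recovered.
  #include <cstdio>
  #include <vector>

  struct Node {
    int State;
    bool Recovered = false; // the flag this patch adds to GSS::Node
  };

  // Pick the next head we have not yet recovered from, and mark it used.
  Node *recoverOnce(std::vector<Node> &Heads) {
    for (Node &N : Heads) {
      if (N.Recovered)
        continue;         // don't recover the same way twice
      N.Recovered = true; // consume this base so it can't be chosen again
      return &N;
    }
    return nullptr; // no fresh base left: the recovery loop stops
  }

  int main() {
    std::vector<Node> Heads = {{1}, {2}}; // e.g. two nested scopes still open
    // Each iteration consumes one base, so this runs at most Heads.size()
    // times instead of looping forever on the same parent.
    while (Node *Base = recoverOnce(Heads))
      std::printf("recovered using the head in state %d\n", Base->State);
  }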

Differential Revision: https://reviews.llvm.org/D130550
Author: Sam McCall
Date: 2022-08-19 15:53:50 +02:00
Parent: 4d931b6e1e
Commit: bd5cc6575b
9 changed files with 122 additions and 73 deletions


@@ -71,6 +71,8 @@ struct GSS {
LRTable::StateID State;
// Used internally to track reachability during garbage collection.
bool GCParity;
// Have we already used this node for error recovery? (prevents loops)
mutable bool Recovered = false;
// Number of the parents of this node.
// The parents hold previous parsed symbols, and may resume control after
// this node is reduced.


@@ -178,7 +178,7 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
llvm::ArrayRef<ForestNode>
ForestArena::createTerminals(const TokenStream &Code) {
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size());
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size() + 1);
size_t Index = 0;
for (const auto &T : Code.tokens()) {
new (&Terminals[Index])
@@ -186,6 +186,12 @@ ForestArena::createTerminals(const TokenStream &Code) {
/*Start=*/Index, /*TerminalData*/ 0);
++Index;
}
// Include an `eof` terminal.
// This is important to drive the final shift/recover/reduce loop.
new (&Terminals[Index])
ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof),
/*Start=*/Index, /*TerminalData*/ 0);
++Index;
NodeCount = Index;
return llvm::makeArrayRef(Terminals, Index);
}


@@ -95,17 +95,19 @@ void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
if (!Seen.insert(N).second)
return;
for (auto Strategy : Lang.Table.getRecovery(N->State)) {
Options.push_back(PlaceholderRecovery{
NextTok,
Strategy.Result,
Strategy.Strategy,
N,
Path,
});
LLVM_DEBUG(llvm::dbgs()
<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
<< " at token " << NextTok << "\n");
if (!N->Recovered) { // Don't recover the same way twice!
for (auto Strategy : Lang.Table.getRecovery(N->State)) {
Options.push_back(PlaceholderRecovery{
NextTok,
Strategy.Result,
Strategy.Strategy,
N,
Path,
});
LLVM_DEBUG(llvm::dbgs()
<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
<< " at token " << NextTok << "\n");
}
}
Path.push_back(N->Payload);
for (const GSS::Node *Parent : N->parents())
@@ -180,6 +182,7 @@ void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
// There are various options, including simply breaking ties between options.
// For now it's obscure enough to ignore.
for (const PlaceholderRecovery *Option : BestOptions) {
Option->RecoveryNode->Recovered = true;
const ForestNode &Placeholder =
Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
LRTable::StateID OldState = Option->RecoveryNode->State;
@@ -587,6 +590,9 @@ private:
auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
assert(NextState.has_value() && "goto must succeed after reduce!");
Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
LLVM_DEBUG(llvm::dbgs()
<< " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
<< " --> S" << Heads->back()->State << "\n");
return true;
}
};
@@ -638,7 +644,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
// We discard all heads formed by reduction, and recreate them without
// this constraint. This may duplicate some nodes, but it's rare.
LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
"Re-reducing without lookahead.");
"Re-reducing without lookahead.\n");
Heads.resize(HeadsPartition);
Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
@@ -662,34 +668,26 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
}
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
// The parse was successful if we're in state `_ := start-symbol .`
auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol);
assert(AcceptState.has_value() && "goto must succeed after start symbol!");
// The parse was successful if in state `_ := start-symbol EOF .`
// The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol);
assert(AfterStart.has_value() && "goto must succeed after start symbol!");
auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof));
assert(Accept.has_value() && "shift EOF must succeed!");
auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
const ForestNode *Result = nullptr;
for (const auto *Head : Heads) {
if (Head->State == *AcceptState) {
assert(Head->Payload->symbol() == StartSymbol);
if (Head->State == *Accept) {
assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
assert(Result == nullptr && "multiple results!");
Result = Head->Payload;
Result = Head->parents().front()->Payload;
assert(Result->symbol() == StartSymbol);
}
}
return Result;
};
if (auto *Result = SearchForAccept(Heads))
return *Result;
// Failed to parse the input, attempt to run recovery.
// FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
// More elegant is to include EOF in the token stream, and make the
// augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
// would not be a special case: it show up as a failure to shift the EOF
// token.
unsigned I = Terminals.size();
glrRecover(Heads, I, Params, Lang, NextHeads);
Reduce(NextHeads, tokenSymbol(tok::eof));
if (auto *Result = SearchForAccept(NextHeads))
return *Result;
// We failed to parse the input, returning an opaque forest node for recovery.
// FIXME: as above, we can add fallback error handling so this is impossible.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
@@ -704,8 +702,10 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
llvm::ArrayRef<const Node *> Parents) {
Node *Result = new (allocate(Parents.size()))
Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
Node *Result = new (allocate(Parents.size())) Node();
Result->State = State;
Result->GCParity = GCParity;
Result->ParentCount = Parents.size();
Alive.push_back(Result);
++NodesCreated;
Result->Payload = Symbol;


@@ -29,9 +29,9 @@
# We list important nonterminals as start symbols, rather than doing it for all
# nonterminals by default, this reduces the number of states by 30% and LRTable
# actions by 16%.
_ := translation-unit
_ := statement-seq
_ := declaration-seq
_ := translation-unit EOF
_ := statement-seq EOF
_ := declaration-seq EOF
# gram.key
#! we don't distinguish between namespaces and namespace aliases, as it's hard


@@ -240,8 +240,9 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
PendingStates.push_back(Result.first);
const Rule &StartRule = G.lookupRule(RID);
assert(StartRule.Size == 1 &&
"Start rule must have exactly one symbol in its body!");
assert(StartRule.Size == 2 &&
StartRule.seq().back() == tokenSymbol(tok::eof) &&
"Start rule must be of the form `_ := start-symbol EOF`!");
Builder.addStartState(StartRule.seq().front(), Result.first);
}


@@ -1,19 +1,21 @@
_ := expr
_ := expr EOF
expr := id
id := IDENTIFIER
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
# GRAPH: States:
# GRAPH-NEXT: State 0
# GRAPH-NEXT: _ := • expr
# GRAPH-NEXT: _ := • expr EOF
# GRAPH-NEXT: expr := • id
# GRAPH-NEXT: id := • IDENTIFIER
# GRAPH-NEXT: State 1
# GRAPH-NEXT: _ := expr •
# GRAPH-NEXT: _ := expr • EOF
# GRAPH-NEXT: State 2
# GRAPH-NEXT: expr := id •
# GRAPH-NEXT: State 3
# GRAPH-NEXT: id := IDENTIFIER •
# GRAPH-NEXT: State 4
# GRAPH-NEXT: _ := expr EOF •
# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
# TABLE: LRTable:
@@ -22,7 +24,9 @@ id := IDENTIFIER
# TABLE-NEXT: expr: go to state 1
# TABLE-NEXT: id: go to state 2
# TABLE-NEXT: State 1
# TABLE-NEXT: EOF: shift state 4
# TABLE-NEXT: State 2
# TABLE-NEXT: EOF: reduce by rule 1 'expr := id'
# TABLE-NEXT: EOF: reduce by rule 2 'expr := id'
# TABLE-NEXT: State 3
# TABLE-NEXT: EOF: reduce by rule 0 'id := IDENTIFIER'
# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER'
# TABLE-NEXT: State 4


@@ -1,31 +1,34 @@
_ := expr
_ := expr EOF
expr := expr - expr # S/R conflict at state 4 on '-' token
expr := IDENTIFIER
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
# GRAPH: States
# GRAPH-NEXT: State 0
# GRAPH-NEXT: _ := • expr EOF
# GRAPH-NEXT: expr := • expr - expr
# GRAPH-NEXT: _ := • expr
# GRAPH-NEXT: expr := • IDENTIFIER
# GRAPH-NEXT: State 1
# GRAPH-NEXT: _ := expr •
# GRAPH-NEXT: _ := expr • EOF
# GRAPH-NEXT: expr := expr • - expr
# GRAPH-NEXT: State 2
# GRAPH-NEXT: expr := IDENTIFIER •
# GRAPH-NEXT: State 3
# GRAPH-NEXT: _ := expr EOF •
# GRAPH-NEXT: State 4
# GRAPH-NEXT: expr := • expr - expr
# GRAPH-NEXT: expr := expr - • expr
# GRAPH-NEXT: expr := • IDENTIFIER
# GRAPH-NEXT: State 4
# GRAPH-NEXT: State 5
# GRAPH-NEXT: expr := expr - expr •
# GRAPH-NEXT: expr := expr • - expr
# GRAPH-NEXT: 0 ->[expr] 1
# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
# GRAPH-NEXT: 1 ->[-] 3
# GRAPH-NEXT: 3 ->[expr] 4
# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
# GRAPH-NEXT: 4 ->[-] 3
# GRAPH-NEXT: 1 ->[EOF] 3
# GRAPH-NEXT: 1 ->[-] 4
# GRAPH-NEXT: 4 ->[expr] 5
# GRAPH-NEXT: 4 ->[IDENTIFIER] 2
# GRAPH-NEXT: 5 ->[-] 4
# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
# TABLE: LRTable:
@@ -33,12 +36,14 @@ expr := IDENTIFIER
# TABLE-NEXT: IDENTIFIER: shift state 2
# TABLE-NEXT: expr: go to state 1
# TABLE-NEXT: State 1
# TABLE-NEXT: -: shift state 3
# TABLE-NEXT: EOF: shift state 3
# TABLE-NEXT: -: shift state 4
# TABLE-NEXT: State 2
# TABLE-NEXT: EOF -: reduce by rule 1 'expr := IDENTIFIER'
# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER'
# TABLE-NEXT: State 3
# TABLE-NEXT: IDENTIFIER: shift state 2
# TABLE-NEXT: expr: go to state 4
# TABLE-NEXT: State 4
# TABLE-NEXT: -: shift state 3
# TABLE-NEXT: EOF -: reduce by rule 0 'expr := expr - expr'
# TABLE-NEXT: IDENTIFIER: shift state 2
# TABLE-NEXT: expr: go to state 5
# TABLE-NEXT: State 5
# TABLE-NEXT: -: shift state 4
# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr'


@@ -54,7 +54,7 @@ protected:
TEST_F(ForestTest, DumpBasic) {
build(R"cpp(
_ := add-expression
_ := add-expression EOF
add-expression := id-expression + id-expression
id-expression := IDENTIFIER
)cpp");
@@ -64,7 +64,7 @@ TEST_F(ForestTest, DumpBasic) {
cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
auto T = Arena.createTerminals(TS);
ASSERT_EQ(T.size(), 3u);
ASSERT_EQ(T.size(), 4u);
const auto *Left = &Arena.createSequence(
symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
const auto *Right = &Arena.createSequence(symbol("id-expression"),
@@ -89,9 +89,9 @@ TEST_F(ForestTest, DumpBasic) {
TEST_F(ForestTest, DumpAmbiguousAndRefs) {
build(R"cpp(
_ := type
type := class-type # rule 3
type := enum-type # rule 4
_ := type EOF
type := class-type # rule 4
type := enum-type # rule 5
class-type := shared-type
enum-type := shared-type
shared-type := IDENTIFIER)cpp");
@@ -100,7 +100,7 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) {
const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
auto Terminals = Arena.createTerminals(TS);
ASSERT_EQ(Terminals.size(), 1u);
ASSERT_EQ(Terminals.size(), 2u);
const auto *SharedType = &Arena.createSequence(
symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
@@ -109,9 +109,9 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) {
const auto *EnumType = &Arena.createSequence(
symbol("enum-type"), ruleFor("enum-type"), {SharedType});
const auto *Alternative1 =
&Arena.createSequence(symbol("type"), /*RuleID=*/3, {ClassType});
&Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType});
const auto *Alternative2 =
&Arena.createSequence(symbol("type"), /*RuleID=*/4, {EnumType});
&Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType});
const auto *Type =
&Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
EXPECT_EQ(Type->dumpRecursive(G),


@@ -509,7 +509,7 @@ TEST_F(GLRTest, PerfectForestNodeSharing) {
// item `expr := • IDENTIFIER`, and both have different goto states on the
// nonterminal `expr`.
build(R"bnf(
_ := test
_ := test EOF
test := { expr
test := { IDENTIFIER
@@ -548,7 +548,7 @@ TEST_F(GLRTest, GLRReduceOrder) {
// foo should be reduced first, so that in step 2 we have completed reduces
// for test, and form an ambiguous forest node.
build(R"bnf(
_ := test
_ := test EOF
test := IDENTIFIER
test := foo
@@ -575,7 +575,7 @@ TEST_F(GLRTest, RecoveryEndToEnd) {
// - multiple possible recovery rules
// - recovery from outer scopes is rejected
build(R"bnf(
_ := block
_ := block EOF
block := { block [recover=Braces] }
block := { numbers [recover=Braces] }
@@ -606,14 +606,14 @@ TEST_F(GLRTest, RecoveryEndToEnd) {
TEST_F(GLRTest, RecoverTerminal) {
build(R"bnf(
_ := stmt
_ := stmt EOF
stmt := IDENTIFIER ; [recover=Skip]
)bnf");
TestLang.Table = LRTable::buildSLR(TestLang.G);
TestLang.RecoveryStrategies.try_emplace(
extensionID("Skip"),
[](Token::Index Start, const TokenStream &) { return Start + 1; });
[](Token::Index Start, const TokenStream &) { return Start; });
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
@@ -630,7 +630,7 @@ TEST_F(GLRTest, RecoverUnrestrictedReduce) {
// We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
build(R"bnf(
_ := sentence
_ := sentence EOF
word := IDENTIFIER
sentence := word word [recover=AcceptAnyTokenInstead]
@@ -652,9 +652,40 @@ TEST_F(GLRTest, RecoverUnrestrictedReduce) {
"[ 1, end) └─word := <opaque>\n");
}
TEST_F(GLRTest, RepeatedRecovery) {
// We require multiple steps of recovery at eof and then a reduction in order
// to successfully parse.
build(R"bnf(
_ := function EOF
# FIXME: this forces EOF to be in follow(signature).
# Remove it once we use unconstrained reduction for recovery.
_ := signature EOF
function := signature body [recover=Skip]
signature := IDENTIFIER params [recover=Skip]
params := ( )
body := { }
)bnf");
TestLang.Table = LRTable::buildSLR(TestLang.G);
TestLang.RecoveryStrategies.try_emplace(
extensionID("Skip"),
[](Token::Index Start, const TokenStream &) { return Start; });
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("main", LOptions), LOptions);
const ForestNode &Parsed =
glrParse({Tokens, Arena, GSStack}, id("function"), TestLang);
EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
"[ 0, end) function := signature body [recover=Skip]\n"
"[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
"[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
"[ 1, 1) │ └─params := <opaque>\n"
"[ 1, end) └─body := <opaque>\n");
}
TEST_F(GLRTest, NoExplicitAccept) {
build(R"bnf(
_ := test
_ := test EOF
test := IDENTIFIER test
test := IDENTIFIER
@@ -677,7 +708,7 @@ TEST_F(GLRTest, NoExplicitAccept) {
TEST_F(GLRTest, GuardExtension) {
build(R"bnf(
_ := start
_ := start EOF
start := IDENTIFIER [guard]
)bnf");