[pseudo] Start rules are `_ := start-symbol EOF`, improve recovery.

Previously we called glrRecover() ad hoc at the end of input.
This had two main problems:
 - glrRecover() on two separate code paths is inelegant
 - We may have to recover several times in succession (e.g. to exit from
   nested scopes), so we need a loop at end-of-file
Having an actual shift action for an EOF terminal allows us to handle
both concerns in the main shift/recover/reduce loop.
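
The sketch below is a toy illustration of this point, not clang-pseudo code:
shiftable(), recover(), and the State type are hypothetical stand-ins. With an
explicit eof token at the end of the stream, "stuck at end of input" is just
another failed shift, so one recovery loop covers mid-input failures and
repeated recovery at end-of-file (e.g. closing several nested scopes).

  // Toy example only; hypothetical helpers, not the clang-pseudo API.
  #include <cstdio>
  #include <vector>

  enum Tok { Identifier, LBrace, RBrace, Eof };

  // Pretend parser state: how many scopes are still open.
  struct State { int OpenScopes = 0; };

  // A shift succeeds unless we reach eof with scopes still open.
  bool shiftable(const State &S, Tok T) {
    return !(T == Eof && S.OpenScopes > 0);
  }

  // One recovery step: synthesize a placeholder that closes one scope.
  bool recover(State &S) {
    if (S.OpenScopes == 0)
      return false;
    --S.OpenScopes;
    std::puts("recover: closed one scope with a placeholder");
    return true;
  }

  int main() {
    // Token stream with eof appended, as createTerminals() now does.
    std::vector<Tok> Terminals = {Identifier, LBrace, LBrace, Eof};
    State S;
    for (Tok T : Terminals) {
      // One loop for mid-input and end-of-input: recover until we can shift.
      while (!shiftable(S, T)) {
        if (!recover(S)) {
          std::puts("no recovery possible: give up");
          return 1;
        }
      }
      if (T == LBrace)
        ++S.OpenScopes;
      else if (T == RBrace)
        --S.OpenScopes;
    }
    std::puts("eof shifted: parse accepted");
  }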

This revealed a design bug in recovery: it could enter a loop by repeatedly
choosing the same parent node and recovering from it in the same way.
Addressed this by allowing each node to be used as a recovery base only once.
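
A minimal sketch of that fix follows; Node and recoverOnce below are
illustrative stand-ins, not the real GSS types. Marking each candidate base
node the first time it is used means a chain of recoveries can consume each
head at most once, so it must terminate.

  // Toy example of "use each node as a recovery base once"; Node and
  // recoverOnce are hypothetical, loosely mirroring GSS::Node::Recovered.
  #include <cstdio>
  #include <vector>

  struct Node {
    int State;
    bool Recovered = false; // the flag this patch adds to GSS::Node
  };

  // Pick the next head we have not yet recovered from, and mark it used.
  Node *recoverOnce(std::vector<Node> &Heads) {
    for (Node &N : Heads) {
      if (N.Recovered)
        continue;         // don't recover the same way twice
      N.Recovered = true; // consume this base so it can't be chosen again
      return &N;
    }
    return nullptr; // no fresh base left: the recovery loop stops
  }

  int main() {
    std::vector<Node> Heads = {{1}, {2}}; // e.g. two nested scopes still open
    // Each iteration consumes one base, so this runs at most Heads.size()
    // times instead of looping forever on the same parent.
    while (Node *Base = recoverOnce(Heads))
      std::printf("recovered using the head in state %d\n", Base->State);
  }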

Differential Revision: https://reviews.llvm.org/D130550
Author: Sam McCall
Date: 2022-08-19 15:53:50 +02:00
Parent: 4d931b6e1e
Commit: bd5cc6575b
9 changed files with 122 additions and 73 deletions


@@ -71,6 +71,8 @@ struct GSS {
LRTable::StateID State;
// Used internally to track reachability during garbage collection.
bool GCParity;
// Have we already used this node for error recovery? (prevents loops)
mutable bool Recovered = false;
// Number of the parents of this node.
// The parents hold previous parsed symbols, and may resume control after
// this node is reduced.


@@ -178,7 +178,7 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
llvm::ArrayRef<ForestNode>
ForestArena::createTerminals(const TokenStream &Code) {
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size());
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size() + 1);
size_t Index = 0;
for (const auto &T : Code.tokens()) {
new (&Terminals[Index])
@@ -186,6 +186,12 @@ ForestArena::createTerminals(const TokenStream &Code) {
/*Start=*/Index, /*TerminalData*/ 0);
++Index;
}
// Include an `eof` terminal.
// This is important to drive the final shift/recover/reduce loop.
new (&Terminals[Index])
ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof),
/*Start=*/Index, /*TerminalData*/ 0);
++Index;
NodeCount = Index;
return llvm::makeArrayRef(Terminals, Index);
}


@@ -95,17 +95,19 @@ void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
if (!Seen.insert(N).second)
return;
for (auto Strategy : Lang.Table.getRecovery(N->State)) {
Options.push_back(PlaceholderRecovery{
NextTok,
Strategy.Result,
Strategy.Strategy,
N,
Path,
});
LLVM_DEBUG(llvm::dbgs()
<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
<< " at token " << NextTok << "\n");
if (!N->Recovered) { // Don't recover the same way twice!
for (auto Strategy : Lang.Table.getRecovery(N->State)) {
Options.push_back(PlaceholderRecovery{
NextTok,
Strategy.Result,
Strategy.Strategy,
N,
Path,
});
LLVM_DEBUG(llvm::dbgs()
<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
<< " at token " << NextTok << "\n");
}
}
Path.push_back(N->Payload);
for (const GSS::Node *Parent : N->parents())
@@ -180,6 +182,7 @@ void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
// There are various options, including simply breaking ties between options.
// For now it's obscure enough to ignore.
for (const PlaceholderRecovery *Option : BestOptions) {
Option->RecoveryNode->Recovered = true;
const ForestNode &Placeholder =
Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
LRTable::StateID OldState = Option->RecoveryNode->State;
@@ -587,6 +590,9 @@ private:
auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
assert(NextState.has_value() && "goto must succeed after reduce!");
Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
LLVM_DEBUG(llvm::dbgs()
<< " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
<< " --> S" << Heads->back()->State << "\n");
return true;
}
};
@@ -638,7 +644,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
// We discard all heads formed by reduction, and recreate them without
// this constraint. This may duplicate some nodes, but it's rare.
LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
"Re-reducing without lookahead.");
"Re-reducing without lookahead.\n");
Heads.resize(HeadsPartition);
Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
@@ -662,34 +668,26 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
}
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
// The parse was successful if we're in state `_ := start-symbol .`
auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol);
assert(AcceptState.has_value() && "goto must succeed after start symbol!");
// The parse was successful if in state `_ := start-symbol EOF .`
// The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol);
assert(AfterStart.has_value() && "goto must succeed after start symbol!");
auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof));
assert(Accept.has_value() && "shift EOF must succeed!");
auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
const ForestNode *Result = nullptr;
for (const auto *Head : Heads) {
if (Head->State == *AcceptState) {
assert(Head->Payload->symbol() == StartSymbol);
if (Head->State == *Accept) {
assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
assert(Result == nullptr && "multiple results!");
Result = Head->Payload;
Result = Head->parents().front()->Payload;
assert(Result->symbol() == StartSymbol);
}
}
return Result;
};
if (auto *Result = SearchForAccept(Heads))
return *Result;
// Failed to parse the input, attempt to run recovery.
// FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
// More elegant is to include EOF in the token stream, and make the
// augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
// would not be a special case: it show up as a failure to shift the EOF
// token.
unsigned I = Terminals.size();
glrRecover(Heads, I, Params, Lang, NextHeads);
Reduce(NextHeads, tokenSymbol(tok::eof));
if (auto *Result = SearchForAccept(NextHeads))
return *Result;
// We failed to parse the input, returning an opaque forest node for recovery.
// FIXME: as above, we can add fallback error handling so this is impossible.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
@@ -704,8 +702,10 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
llvm::ArrayRef<const Node *> Parents) {
Node *Result = new (allocate(Parents.size()))
Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
Node *Result = new (allocate(Parents.size())) Node();
Result->State = State;
Result->GCParity = GCParity;
Result->ParentCount = Parents.size();
Alive.push_back(Result);
++NodesCreated;
Result->Payload = Symbol;


@@ -29,9 +29,9 @@
# We list important nonterminals as start symbols, rather than doing it for all
# nonterminals by default, this reduces the number of states by 30% and LRTable
# actions by 16%.
_ := translation-unit
_ := statement-seq
_ := declaration-seq
_ := translation-unit EOF
_ := statement-seq EOF
_ := declaration-seq EOF
# gram.key
#! we don't distinguish between namespaces and namespace aliases, as it's hard


@@ -240,8 +240,9 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
PendingStates.push_back(Result.first);
const Rule &StartRule = G.lookupRule(RID);
assert(StartRule.Size == 1 &&
"Start rule must have exactly one symbol in its body!");
assert(StartRule.Size == 2 &&
StartRule.seq().back() == tokenSymbol(tok::eof) &&
"Start rule must be of the form `_ := start-symbol EOF`!");
Builder.addStartState(StartRule.seq().front(), Result.first);
}


@@ -1,19 +1,21 @@
_ := expr
_ := expr EOF
expr := id
id := IDENTIFIER
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
# GRAPH: States:
# GRAPH-NEXT: State 0
# GRAPH-NEXT: _ := • expr
# GRAPH-NEXT: _ := • expr EOF
# GRAPH-NEXT: expr := • id
# GRAPH-NEXT: id := • IDENTIFIER
# GRAPH-NEXT: State 1
# GRAPH-NEXT: _ := expr •
# GRAPH-NEXT: _ := expr • EOF
# GRAPH-NEXT: State 2
# GRAPH-NEXT: expr := id •
# GRAPH-NEXT: State 3
# GRAPH-NEXT: id := IDENTIFIER •
# GRAPH-NEXT: State 4
# GRAPH-NEXT: _ := expr EOF •
# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
# TABLE: LRTable:
@@ -22,7 +24,9 @@ id := IDENTIFIER
# TABLE-NEXT: expr: go to state 1
# TABLE-NEXT: id: go to state 2
# TABLE-NEXT: State 1
# TABLE-NEXT: EOF: shift state 4
# TABLE-NEXT: State 2
# TABLE-NEXT: EOF: reduce by rule 1 'expr := id'
# TABLE-NEXT: EOF: reduce by rule 2 'expr := id'
# TABLE-NEXT: State 3
# TABLE-NEXT: EOF: reduce by rule 0 'id := IDENTIFIER'
# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER'
# TABLE-NEXT: State 4


@@ -1,31 +1,34 @@
_ := expr
_ := expr EOF
expr := expr - expr # S/R conflict at state 4 on '-' token
expr := IDENTIFIER
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
# GRAPH: States
# GRAPH-NEXT: State 0
# GRAPH-NEXT: _ := • expr EOF
# GRAPH-NEXT: expr := • expr - expr
# GRAPH-NEXT: _ := • expr
# GRAPH-NEXT: expr := • IDENTIFIER
# GRAPH-NEXT: State 1
# GRAPH-NEXT: _ := expr •
# GRAPH-NEXT: _ := expr • EOF
# GRAPH-NEXT: expr := expr • - expr
# GRAPH-NEXT: State 2
# GRAPH-NEXT: expr := IDENTIFIER •
# GRAPH-NEXT: State 3
# GRAPH-NEXT: _ := expr EOF •
# GRAPH-NEXT: State 4
# GRAPH-NEXT: expr := • expr - expr
# GRAPH-NEXT: expr := expr - • expr
# GRAPH-NEXT: expr := • IDENTIFIER
# GRAPH-NEXT: State 4
# GRAPH-NEXT: State 5
# GRAPH-NEXT: expr := expr - expr •
# GRAPH-NEXT: expr := expr • - expr
# GRAPH-NEXT: 0 ->[expr] 1
# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
# GRAPH-NEXT: 1 ->[-] 3
# GRAPH-NEXT: 3 ->[expr] 4
# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
# GRAPH-NEXT: 4 ->[-] 3
# GRAPH-NEXT: 1 ->[EOF] 3
# GRAPH-NEXT: 1 ->[-] 4
# GRAPH-NEXT: 4 ->[expr] 5
# GRAPH-NEXT: 4 ->[IDENTIFIER] 2
# GRAPH-NEXT: 5 ->[-] 4
# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
# TABLE: LRTable:
@@ -33,12 +36,14 @@ expr := IDENTIFIER
# TABLE-NEXT: IDENTIFIER: shift state 2
# TABLE-NEXT: expr: go to state 1
# TABLE-NEXT: State 1
# TABLE-NEXT: -: shift state 3
# TABLE-NEXT: EOF: shift state 3
# TABLE-NEXT: -: shift state 4
# TABLE-NEXT: State 2
# TABLE-NEXT: EOF -: reduce by rule 1 'expr := IDENTIFIER'
# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER'
# TABLE-NEXT: State 3
# TABLE-NEXT: IDENTIFIER: shift state 2
# TABLE-NEXT: expr: go to state 4
# TABLE-NEXT: State 4
# TABLE-NEXT: -: shift state 3
# TABLE-NEXT: EOF -: reduce by rule 0 'expr := expr - expr'
# TABLE-NEXT: IDENTIFIER: shift state 2
# TABLE-NEXT: expr: go to state 5
# TABLE-NEXT: State 5
# TABLE-NEXT: -: shift state 4
# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr'


@@ -54,7 +54,7 @@ protected:
TEST_F(ForestTest, DumpBasic) {
build(R"cpp(
_ := add-expression
_ := add-expression EOF
add-expression := id-expression + id-expression
id-expression := IDENTIFIER
)cpp");
@@ -64,7 +64,7 @@ TEST_F(ForestTest, DumpBasic) {
cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
auto T = Arena.createTerminals(TS);
ASSERT_EQ(T.size(), 3u);
ASSERT_EQ(T.size(), 4u);
const auto *Left = &Arena.createSequence(
symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
const auto *Right = &Arena.createSequence(symbol("id-expression"),
@@ -89,9 +89,9 @@ TEST_F(ForestTest, DumpBasic) {
TEST_F(ForestTest, DumpAmbiguousAndRefs) {
build(R"cpp(
_ := type
type := class-type # rule 3
type := enum-type # rule 4
_ := type EOF
type := class-type # rule 4
type := enum-type # rule 5
class-type := shared-type
enum-type := shared-type
shared-type := IDENTIFIER)cpp");
@@ -100,7 +100,7 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) {
const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
auto Terminals = Arena.createTerminals(TS);
ASSERT_EQ(Terminals.size(), 1u);
ASSERT_EQ(Terminals.size(), 2u);
const auto *SharedType = &Arena.createSequence(
symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
@@ -109,9 +109,9 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) {
const auto *EnumType = &Arena.createSequence(
symbol("enum-type"), ruleFor("enum-type"), {SharedType});
const auto *Alternative1 =
&Arena.createSequence(symbol("type"), /*RuleID=*/3, {ClassType});
&Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType});
const auto *Alternative2 =
&Arena.createSequence(symbol("type"), /*RuleID=*/4, {EnumType});
&Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType});
const auto *Type =
&Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
EXPECT_EQ(Type->dumpRecursive(G),


@@ -509,7 +509,7 @@ TEST_F(GLRTest, PerfectForestNodeSharing) {
// item `expr := • IDENTIFIER`, and both have different goto states on the
// nonterminal `expr`.
build(R"bnf(
_ := test
_ := test EOF
test := { expr
test := { IDENTIFIER
@@ -548,7 +548,7 @@ TEST_F(GLRTest, GLRReduceOrder) {
// foo should be reduced first, so that in step 2 we have completed reduces
// for test, and form an ambiguous forest node.
build(R"bnf(
_ := test
_ := test EOF
test := IDENTIFIER
test := foo
@@ -575,7 +575,7 @@ TEST_F(GLRTest, RecoveryEndToEnd) {
// - multiple possible recovery rules
// - recovery from outer scopes is rejected
build(R"bnf(
_ := block
_ := block EOF
block := { block [recover=Braces] }
block := { numbers [recover=Braces] }
@@ -606,14 +606,14 @@ TEST_F(GLRTest, RecoveryEndToEnd) {
TEST_F(GLRTest, RecoverTerminal) {
build(R"bnf(
_ := stmt
_ := stmt EOF
stmt := IDENTIFIER ; [recover=Skip]
)bnf");
TestLang.Table = LRTable::buildSLR(TestLang.G);
TestLang.RecoveryStrategies.try_emplace(
extensionID("Skip"),
[](Token::Index Start, const TokenStream &) { return Start + 1; });
[](Token::Index Start, const TokenStream &) { return Start; });
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
@@ -630,7 +630,7 @@ TEST_F(GLRTest, RecoverUnrestrictedReduce) {
// We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
build(R"bnf(
_ := sentence
_ := sentence EOF
word := IDENTIFIER
sentence := word word [recover=AcceptAnyTokenInstead]
@@ -652,9 +652,40 @@ TEST_F(GLRTest, RecoverUnrestrictedReduce) {
"[ 1, end) └─word := <opaque>\n");
}
TEST_F(GLRTest, RepeatedRecovery) {
// We require multiple steps of recovery at eof and then a reduction in order
// to successfully parse.
build(R"bnf(
_ := function EOF
# FIXME: this forces EOF to be in follow(signature).
# Remove it once we use unconstrained reduction for recovery.
_ := signature EOF
function := signature body [recover=Skip]
signature := IDENTIFIER params [recover=Skip]
params := ( )
body := { }
)bnf");
TestLang.Table = LRTable::buildSLR(TestLang.G);
TestLang.RecoveryStrategies.try_emplace(
extensionID("Skip"),
[](Token::Index Start, const TokenStream &) { return Start; });
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("main", LOptions), LOptions);
const ForestNode &Parsed =
glrParse({Tokens, Arena, GSStack}, id("function"), TestLang);
EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
"[ 0, end) function := signature body [recover=Skip]\n"
"[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
"[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
"[ 1, 1) │ └─params := <opaque>\n"
"[ 1, end) └─body := <opaque>\n");
}
TEST_F(GLRTest, NoExplicitAccept) {
build(R"bnf(
_ := test
_ := test EOF
test := IDENTIFIER test
test := IDENTIFIER
@@ -677,7 +708,7 @@ TEST_F(GLRTest, NoExplicitAccept) {
TEST_F(GLRTest, GuardExtension) {
build(R"bnf(
_ := start
_ := start EOF
start := IDENTIFIER [guard]
)bnf");