266 lines
8.8 KiB
C++
266 lines
8.8 KiB
C++
//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "clang-pseudo/grammar/LRGraph.h"
|
|
#include "clang-pseudo/grammar/Grammar.h"
|
|
#include "llvm/ADT/DenseSet.h"
|
|
#include "llvm/ADT/Hashing.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/Support/FormatVariadic.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
using ItemSet = std::vector<clang::pseudo::Item>;
|
|
|
|
namespace llvm {
|
|
// Support clang::pseudo::Item as DenseMap keys.
|
|
template <> struct DenseMapInfo<ItemSet> {
|
|
static inline ItemSet getEmptyKey() {
|
|
return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
|
|
}
|
|
static inline ItemSet getTombstoneKey() {
|
|
return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
|
|
}
|
|
static unsigned getHashValue(const ItemSet &I) {
|
|
return llvm::hash_combine_range(I.begin(), I.end());
|
|
}
|
|
static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
|
|
return LHS == RHS;
|
|
}
|
|
};
|
|
} // namespace llvm
|
|
|
|
namespace clang {
|
|
namespace pseudo {
|
|
namespace {
|
|
|
|
struct SortByNextSymbol {
|
|
SortByNextSymbol(const Grammar &G) : G(G) {}
|
|
bool operator()(const Item &L, const Item &R) {
|
|
if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
|
|
return L.next(G) < R.next(G);
|
|
if (L.hasNext() != R.hasNext())
|
|
return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
|
|
return L < R;
|
|
}
|
|
const Grammar &G;
|
|
};
|
|
|
|
// Computes a closure of the given item set S:
|
|
// - extends the given S to contain all options for parsing next token;
|
|
// - nonterminals after a dot are recursively expanded into the begin-state
|
|
// of all production rules that produce that nonterminal;
|
|
//
|
|
// Given
|
|
// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
|
|
// Input = [ E := . T ]
|
|
// returns [ E := . T, T := . n, T := . ( E ) ]
|
|
State closure(ItemSet Queue, const Grammar &G) {
|
|
llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
|
|
// We reuse the passed-by-value Queue as the final result, as it's already
|
|
// initialized to the right elements.
|
|
size_t ItIndex = 0;
|
|
while (ItIndex < Queue.size()) {
|
|
const Item &ExpandingItem = Queue[ItIndex];
|
|
++ItIndex;
|
|
if (!ExpandingItem.hasNext())
|
|
continue;
|
|
|
|
SymbolID NextSym = ExpandingItem.next(G);
|
|
if (pseudo::isToken(NextSym))
|
|
continue;
|
|
auto RRange = G.table().Nonterminals[NextSym].RuleRange;
|
|
for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
|
|
Item NewItem = Item::start(RID, G);
|
|
if (InQueue.insert(NewItem).second) // new
|
|
Queue.push_back(std::move(NewItem));
|
|
}
|
|
}
|
|
Queue.shrink_to_fit();
|
|
llvm::sort(Queue, SortByNextSymbol(G));
|
|
return {std::move(Queue)};
|
|
}
|
|
|
|
// Returns all next (with a dot advanced) kernel item sets, partitioned by the
|
|
// advanced symbol.
|
|
//
|
|
// Given
|
|
// S = [ E := . a b, E := E . - T ]
|
|
// returns [
|
|
// {id(a), [ E := a . b ]},
|
|
// {id(-), [ E := E - . T ]}
|
|
// ]
|
|
std::vector<std::pair<SymbolID, ItemSet>>
|
|
nextAvailableKernelItems(const State &S, const Grammar &G) {
|
|
std::vector<std::pair<SymbolID, ItemSet>> Results;
|
|
llvm::ArrayRef<Item> AllItems = S.Items;
|
|
AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
|
|
while (!AllItems.empty()) {
|
|
SymbolID AdvancedSymbol = AllItems.front().next(G);
|
|
auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
|
|
assert(I.hasNext());
|
|
return I.next(G) == AdvancedSymbol;
|
|
});
|
|
assert(!Batch.empty());
|
|
AllItems = AllItems.drop_front(Batch.size());
|
|
|
|
// Advance a dot over the Symbol.
|
|
ItemSet Next;
|
|
for (const Item &I : Batch)
|
|
Next.push_back(I.advance());
|
|
// sort the set to keep order determinism for hash computation.
|
|
llvm::sort(Next);
|
|
Results.push_back({AdvancedSymbol, std::move(Next)});
|
|
}
|
|
return Results;
|
|
}
|
|
|
|
std::vector<std::pair<ExtensionID, SymbolID>>
|
|
availableRecovery(const State &S, const Grammar &G) {
|
|
std::vector<std::pair<ExtensionID, SymbolID>> Result;
|
|
for (const Item &I : S.Items) {
|
|
const auto &Rule = G.lookupRule(I.rule());
|
|
if (I.dot() != Rule.RecoveryIndex)
|
|
continue;
|
|
Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
|
|
}
|
|
llvm::sort(Result);
|
|
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
|
|
return Result;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
std::string Item::dump(const Grammar &G) const {
|
|
const auto &Rule = G.lookupRule(RID);
|
|
auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
|
|
std::vector<llvm::StringRef> Results;
|
|
for (auto SID : Syms)
|
|
Results.push_back(G.symbolName(SID));
|
|
return Results;
|
|
};
|
|
return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
|
|
llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
|
|
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
|
|
Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
|
|
.str();
|
|
}
|
|
|
|
std::string State::dump(const Grammar &G, unsigned Indent) const {
|
|
std::string Result;
|
|
llvm::raw_string_ostream OS(Result);
|
|
for (const auto &Item : Items)
|
|
OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
|
|
return OS.str();
|
|
}
|
|
|
|
std::string LRGraph::dumpForTests(const Grammar &G) const {
|
|
std::string Result;
|
|
llvm::raw_string_ostream OS(Result);
|
|
OS << "States:\n";
|
|
for (StateID ID = 0; ID < States.size(); ++ID) {
|
|
OS << llvm::formatv("State {0}\n", ID);
|
|
OS << States[ID].dump(G, /*Indent*/ 4);
|
|
}
|
|
for (const auto &E : Edges) {
|
|
OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
|
|
E.Dst);
|
|
}
|
|
return OS.str();
|
|
}
|
|
|
|
LRGraph LRGraph::buildLR0(const Grammar &G) {
|
|
class Builder {
|
|
public:
|
|
Builder(const Grammar &G) : G(G) {}
|
|
|
|
// Adds a given state if not existed.
|
|
std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
|
|
assert(llvm::is_sorted(KernelItems) &&
|
|
"Item must be sorted before inserting to a hash map!");
|
|
auto It = StatesIndex.find(KernelItems);
|
|
if (It != StatesIndex.end())
|
|
return {It->second, false};
|
|
States.push_back(closure(KernelItems, G));
|
|
StateID NextStateID = States.size() - 1;
|
|
StatesIndex.insert({std::move(KernelItems), NextStateID});
|
|
return {NextStateID, true};
|
|
}
|
|
|
|
void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
|
|
Edges.push_back({Src, Dst, Label});
|
|
}
|
|
|
|
void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) {
|
|
Recoveries.push_back({Src, Strategy, Result});
|
|
}
|
|
|
|
// Returns a state with the given id.
|
|
const State &find(StateID ID) const {
|
|
assert(ID < States.size());
|
|
return States[ID];
|
|
}
|
|
|
|
void addStartState(SymbolID Sym, StateID State) {
|
|
StartStates.push_back({Sym, State});
|
|
}
|
|
|
|
LRGraph build() && {
|
|
States.shrink_to_fit();
|
|
Edges.shrink_to_fit();
|
|
Recoveries.shrink_to_fit();
|
|
llvm::sort(StartStates);
|
|
StartStates.shrink_to_fit();
|
|
return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
|
|
std::move(StartStates));
|
|
}
|
|
|
|
private:
|
|
// Key is the **kernel** item sets.
|
|
llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
|
|
std::vector<State> States;
|
|
std::vector<Edge> Edges;
|
|
std::vector<Recovery> Recoveries;
|
|
const Grammar &G;
|
|
std::vector<std::pair<SymbolID, StateID>> StartStates;
|
|
} Builder(G);
|
|
|
|
std::vector<StateID> PendingStates;
|
|
// Initialize states with the start symbol.
|
|
auto RRange = G.table().Nonterminals[G.underscore()].RuleRange;
|
|
for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
|
|
auto StartState = std::vector<Item>{Item::start(RID, G)};
|
|
auto Result = Builder.insert(std::move(StartState));
|
|
assert(Result.second && "State must be new");
|
|
PendingStates.push_back(Result.first);
|
|
|
|
const Rule &StartRule = G.lookupRule(RID);
|
|
assert(StartRule.Size == 2 &&
|
|
StartRule.seq().back() == tokenSymbol(tok::eof) &&
|
|
"Start rule must be of the form `_ := start-symbol EOF`!");
|
|
Builder.addStartState(StartRule.seq().front(), Result.first);
|
|
}
|
|
|
|
while (!PendingStates.empty()) {
|
|
auto StateID = PendingStates.back();
|
|
PendingStates.pop_back();
|
|
for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
|
|
auto Insert = Builder.insert(Next.second);
|
|
if (Insert.second) // new state, insert to the pending queue.
|
|
PendingStates.push_back(Insert.first);
|
|
Builder.insertEdge(StateID, Insert.first, Next.first);
|
|
}
|
|
for (auto Recovery : availableRecovery(Builder.find(StateID), G))
|
|
Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
|
|
}
|
|
return std::move(Builder).build();
|
|
}
|
|
|
|
} // namespace pseudo
|
|
} // namespace clang
|