diff --git a/flang/preprocessor.cc b/flang/preprocessor.cc new file mode 100644 index 000000000000..c2dbeaf1f27c --- /dev/null +++ b/flang/preprocessor.cc @@ -0,0 +1,421 @@ +#include "preprocessor.h" +#include "char-buffer.h" +#include "idioms.h" +#include "prescan.h" +#include +#include +#include +#include + +namespace Fortran { + +void TokenSequence::Append(const TokenSequence &that) { + if (nextStart_ < char_.size()) { + start_.push_back(nextStart_); + } + int offset = char_.size(); + for (int st : that.start_) { + start_.push_back(st + offset); + } + char_.insert(char_.end(), that.char_.begin(), that.char_.end()); + nextStart_ = char_.size(); +} + +void TokenSequence::Emit(CharBuffer *out) { + out->Put(char_); +} + +Definition::Definition(const TokenSequence &repl, size_t firstToken, + size_t tokens) + : replacement_{Tokenize({}, repl, firstToken, tokens)} {} + +Definition::Definition(const std::vector &argNames, + const TokenSequence &repl, size_t firstToken, + size_t tokens) + : isFunctionLike_{true}, argumentCount_(argNames.size()), + replacement_{Tokenize(argNames, repl, firstToken, tokens)} {} + +bool Definition::set_isDisabled(bool disable) { + bool was{isDisabled_}; + isDisabled_ = disable; + return was; +} + +TokenSequence Definition::Tokenize(const std::vector &argNames, + const TokenSequence &token, + size_t firstToken, size_t tokens) { + std::map args; + char argIndex{'A'}; + for (const std::string &arg : argNames) { + CHECK(args.find(arg) == args.end()); + args[arg] = "~"s + argIndex++; + } + TokenSequence result; + bool pasting{false}; + for (size_t j{0}; j < tokens; ++j) { + size_t bytes{token.GetBytes(firstToken + j)}; + if (bytes == 0) { + continue; + } + const char *text{token.GetText(firstToken + j)}; + if (bytes == 2 && text[0] == '#' && text[1] == '#') { + for (size_t rtc{result.size()}; + rtc > 0 && (result.GetBytes(rtc-1) == 0 || + *result.GetText(rtc-1) == ' '); + --rtc) { + result.pop_back(); + } + pasting = true; + continue; + } + if (*text == ' ') { + if (pasting) { + continue; + } + } else { + pasting = false; + if (bytes > 0 && (*text == '_' || isalpha(*text))) { + auto it = args.find(token.GetString(firstToken + j)); + if (it != args.end()) { + result.push_back(it->second); + continue; + } + } + } + result.push_back(text, bytes); + } + return result; +} + +TokenSequence Definition::Apply(const std::vector &args) { + TokenSequence result; + bool stringify{false}; + size_t tokens{replacement_.size()}; + for (size_t j{0}; j < tokens; ++j) { + size_t bytes{replacement_.GetBytes(j)}; + const char *text{replacement_.GetText(j)}; + if (bytes == 2 && *text == '~') { + size_t index = text[1] - 'A'; + if (index >= args.size()) { + continue; + } + size_t argTokens{args[index].size()}; + if (stringify) { + std::string strung{'"'}; + for (size_t k{0}; k < argTokens; ++k) { + size_t argBytes{args[index].GetBytes(k)}; + const char *arg{args[index].GetText(k)}; + for (size_t n{0}; n < argBytes; ++n) { + char ch{arg[n]}; + if (ch == '"' || ch == '\\') { + strung += '\\'; + } + strung += ch; + } + } + strung += '"'; + result.pop_back(); // remove the '#' + result.push_back(strung); + } else { + for (size_t k{0}; k < argTokens; ++k) { + result.push_back(args[index].GetText(k), args[index].GetBytes(k)); + } + } + } else { + stringify = bytes == 1 && *text == '#'; + result.push_back(text, bytes); + } + } + return result; +} + +bool Preprocessor::MacroReplacement(const TokenSequence &input, + TokenSequence *result) { + // Do quick scan for any use of a defined name. + if (definitions_.empty()) { + return false; + } + size_t tokens{input.size()}; + size_t j; + for (j = 0; j < tokens; ++j) { + const char *text{input.GetText(j)}; + size_t bytes{input.GetBytes(j)}; + if (bytes > 0 && + (*text == '_' || isalpha(*text)) && + definitions_.find(CharPointerWithLength{text, bytes}) != + definitions_.end()) { + break; + } + } + if (j == tokens) { + return false; // nothing appeared that could be replaced + } + + for (size_t k{0}; k < j; ++k) { + result->push_back(input.GetToken(k)); + } + for (; j < tokens; ++j) { + size_t bytes{input.GetBytes(j)}; + const char *text{input.GetText(j)}; + if (bytes == 0 || (!isalpha(*text) && *text != '_')) { + result->push_back(text, bytes); + continue; + } + auto it = definitions_.find(CharPointerWithLength{text, bytes}); + if (it == definitions_.end()) { + result->push_back(text, bytes); + continue; + } + Definition &def{it->second}; + if (def.isDisabled()) { + result->push_back(text, bytes); + continue; + } + if (!def.isFunctionLike()) { + def.set_isDisabled(true); + TokenSequence repl; + result->Append(MacroReplacement(def.replacement(), &repl) ? repl + : def.replacement()); + def.set_isDisabled(false); + continue; + } + // Possible function-like macro call. Skip spaces and newlines to see + // whether '(' is next. + size_t k{j}; + bool leftParen{false}; + while (++k < tokens) { + size_t bytes{input.GetBytes(k)}; + const char *text{input.GetText(k)}; + if (bytes > 0 && *text != ' ' && *text != '\n') { + leftParen = bytes == 1 && *text == '('; + break; + } + } + if (!leftParen) { + result->push_back(text, bytes); + continue; + } + std::vector argStart{++k}; + for (int nesting{0}; k < tokens; ++k) { + size_t bytes{input.GetBytes(k)}; + const char *text{input.GetText(k)}; + if (bytes == 1 && *text == '(') { + ++nesting; + } else if (bytes == 1 && *text == ')') { + if (nesting == 0) { + break; + } + --nesting; + } else if (bytes == 1 && *text == ',' && nesting == 0) { + argStart.push_back(k + 1); + } + } + if (k >= tokens || + argStart.size() != def.argumentCount()) { + result->push_back(text, bytes); + continue; + } + j = k; // advance to the terminal ')' + std::vector args; + for (k = 0; k < argStart.size(); ++k) { + size_t at{argStart[k]}; + size_t count{(k + 1 == argStart.size() ? j : argStart[k+1] - 1) - at}; + TokenSequence actual; + for (; count-- > 0; ++at) { + actual.push_back(input.GetText(at), input.GetBytes(at)); + } + TokenSequence arg; + if (!MacroReplacement(actual, &arg)) { + args.emplace_back(std::move(actual)); + } else { + args.emplace_back(std::move(arg)); + } + } + TokenSequence repl{def.Apply(args)}; + def.set_isDisabled(true); + TokenSequence rescanned; + result->Append(MacroReplacement(repl, &rescanned) ? rescanned : repl); + def.set_isDisabled(false); + } + return true; +} + +static size_t SkipBlanks(const TokenSequence &token, size_t at) { + for (; at < token.size(); ++at) { + if (token.GetBytes(at) > 0 && *token.GetText(at) != ' ') { + break; + } + } + return at; +} + +static std::string GetDirectiveName(const TokenSequence &line) { + size_t tokens{line.size()}; + size_t j{SkipBlanks(line, 0)}; + if (j == tokens || line.GetString(j) != "#") { + return ""s; + } + j = SkipBlanks(line, j + 1); + if (j == tokens) { + return ""s; + } + return line.GetString(j); +} + +std::string Preprocessor::Directive(const TokenSequence &dir) { + size_t tokens{dir.size()}; + size_t j{SkipBlanks(dir, 0)}; + if (j == tokens) { + return ""s; + } + if (dir.GetString(j) != "#") { + return "missing '#'"s; + } + j = SkipBlanks(dir, j + 1); + if (j == tokens) { + return ""s; + } + if (isdigit(*dir.GetText(j)) || *dir.GetText(j) == '"') { + return ""s; // TODO: treat as #line + } + std::string dirName{dir.GetString(j)}; + j = SkipBlanks(dir, j + 1); + std::string nameString; + CharPointerWithLength nameToken; + if (j < tokens && (isalpha(*dir.GetText(j)) || *dir.GetText(j) == '_')) { + nameString = dir.GetString(j); + nameToken = dir.GetToken(j); + } + if (dirName == "define") { + if (nameToken.empty()) { + return "#define: missing or invalid name"s; + } + // Get a pointer to a "permanent" copy of the name for use as the + // key in the definitions_ map. + names_.push_back(nameString); + nameToken = CharPointerWithLength{names_.back().data(), + names_.back().size()}; + definitions_.erase(nameToken); + if (++j < tokens && dir.GetBytes(j) == 1 && *dir.GetText(j) == '(') { + j = SkipBlanks(dir, j + 1); + std::vector argName; + if (dir.GetString(j) != ")") { + while (true) { + std::string an{dir.GetString(j)}; + if (an.empty() || (an[0] != '_' && !isalpha(an[0]))) { + return "#define: missing or invalid argument name"s; + } + argName.push_back(an); + j = SkipBlanks(dir, j + 1); + if (j == tokens) { + return "#define: malformed argument list"s; + } + std::string punc{dir.GetString(j)}; + if (punc == ")") { + break; + } + if (punc != ",") { + return "#define: malformed argument list"s; + } + j = SkipBlanks(dir, j + 1); + if (j == tokens) { + return "#define: malformed argument list"s; + } + } + if (std::set(argName.begin(), argName.end()).size() != + argName.size()) { + return "#define: argument names are not distinct"s; + } + } + j = SkipBlanks(dir, j + 1); + definitions_.emplace( + std::make_pair(nameToken, Definition{argName, dir, j, tokens - j})); + } else { + j = SkipBlanks(dir, j + 1); + definitions_.emplace( + std::make_pair(nameToken, Definition{dir, j, tokens - j})); + } + return ""s; + } + if (dirName == "undef") { + if (nameToken.empty()) { + return "#undef: missing or invalid name"s; + } + j = SkipBlanks(dir, j + 1); + if (j != tokens) { + return "#undef: excess tokens at end of directive"s; + } + definitions_.erase(nameToken); + return ""s; + } + if (dirName == "ifdef" || dirName == "ifndef") { + if (nameToken.empty()) { + return "#"s + dirName + ": missing name"; + } + j = SkipBlanks(dir, j + 1); + if (j != tokens) { + return "#"s + dirName + ": excess tokens at end of directive"; + } + auto it = definitions_.find(nameToken); + if ((it != definitions_.end()) == (dirName == "ifdef")) { + ifStack_.push(true); // #else / #elsif allowed + return {}; + } + int nesting{0}; + while (std::optional + line{prescanner_->NextTokenizedLine()}) { + std::string dn{GetDirectiveName(*line)}; + if (dn == "ifdef" || dn == "ifndef" || dn == "if") { + ++nesting; + } else if (dn == "endif") { + if (nesting-- == 0) { + return ""s; + } + } else if (dn == "else" && nesting == 0) { + ifStack_.push(false); + return ""s; + } // TODO: #elsif + } + return "#"s + dirName + ": missing #endif"; + } + if (dirName == "else") { + j = SkipBlanks(dir, j); + if (j != tokens) { + return "#else: excess tokens at end of directive"s; + } + if (ifStack_.empty()) { + return "#else: no #if, #ifdef, or #ifndef"s; + } + if (!ifStack_.top()) { + return "#else: already appeared in this #if, #ifdef, or #ifndef"s; + } + ifStack_.pop(); + int nesting{0}; + while (std::optional + line{prescanner_->NextTokenizedLine()}) { + std::string dn{GetDirectiveName(*line)}; + if (dn == "ifdef" || dn == "ifndef" || dn == "if") { + ++nesting; + } else if (dn == "endif") { + if (nesting-- == 0) { + return ""s; + } + } + } + return "#else: missing #endif"s; + } + // TODO: #if, #elsif with macro replacement on expressions + if (dirName == "endif") { + j = SkipBlanks(dir, j); + if (j != tokens) { + return "#endif: excess tokens at end of directive"s; + } + if (ifStack_.empty()) { + return "#endif: no #if, #ifdef, or #ifndef"s; + } + ifStack_.pop(); + return ""s; + } + return "#"s + dirName + ": unknown or unimplemented directive"; +} +} // namespace Fortran diff --git a/flang/preprocessor.h b/flang/preprocessor.h new file mode 100644 index 000000000000..eda9839fec3e --- /dev/null +++ b/flang/preprocessor.h @@ -0,0 +1,213 @@ +#ifndef FORTRAN_PREPROCESSOR_H_ +#define FORTRAN_PREPROCESSOR_H_ + +// A Fortran-aware preprocessing module used by the prescanner to implement +// preprocessing directives and macro replacement. Intended to be efficient +// enough to always run on all source files even when no preprocessing is +// needed, so that special compiler command options &/or source file name +// extensions for preprocessing will not be necessary. + +#include "idioms.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Fortran { + +class CharBuffer; +class Prescanner; + +// Just a const char pointer with an associated length; does not own the +// referenced data. Used to describe buffered tokens and hash table keys. +struct CharPointerWithLength { + CharPointerWithLength() {} + CharPointerWithLength(const char *x, size_t n) : data{x}, bytes{n} {} + CharPointerWithLength(const CharPointerWithLength &that) + : data{that.data}, bytes{that.bytes} {} + CharPointerWithLength &operator=(const CharPointerWithLength &that) { + data = that.data; + bytes = that.bytes; + return *this; + } + + bool empty() const { return bytes == 0; } + size_t size() const { return bytes; } + const char &operator[](size_t j) const { return data[j]; } + + const char *data{nullptr}; + size_t bytes{0}; +}; +} // namespace Fortran + +// Specializations to enable std::unordered_map +template<> struct std::hash { + size_t operator()(const Fortran::CharPointerWithLength &x) const { + size_t hash{0}; + const char *p{x.data}, *limit{p + x.bytes}; + for (; p < limit; ++p) { + hash = (hash * 31) ^ *p; + } + return hash; + } +}; + +template<> struct std::equal_to { + bool operator()(const Fortran::CharPointerWithLength &x, + const Fortran::CharPointerWithLength &y) const { + return x.bytes == y.bytes && + std::memcmp(static_cast(x.data), + static_cast(y.data), + x.bytes) == 0; + } +}; + +namespace Fortran { + +// Buffers a contiguous sequence of characters that has been partitioned into +// a sequence of preprocessing tokens. +class TokenSequence { + public: + TokenSequence() {} + TokenSequence(TokenSequence &&that) + : start_{std::move(that.start_)}, nextStart_{that.nextStart_}, + char_{std::move(that.char_)} {} + TokenSequence &operator=(TokenSequence &&that) { + start_ = std::move(that.start_); + nextStart_ = that.nextStart_; + char_ = std::move(that.char_); + return *this; + } + + size_t GetBytes(size_t token) const { + return (token + 1 >= start_.size() ? char_.size() : start_[token + 1]) - + start_[token]; + } + const char *GetText(size_t token) const { + return &char_[start_[token]]; + } + std::string GetString(size_t token) const { + return std::string(GetText(token), GetBytes(token)); + } + CharPointerWithLength GetToken(size_t token) const { + return {GetText(token), GetBytes(token)}; + } + + void AddChar(char ch) { + char_.emplace_back(ch); + } + void EndToken() { + // CHECK(char_.size() > nextStart_); + start_.emplace_back(nextStart_); + nextStart_ = char_.size(); + } + + void Append(const TokenSequence &); + + void Emit(CharBuffer *); + + bool empty() const { return start_.empty(); } + + size_t size() const { return start_.size(); } + + void clear() { + start_.clear(); + nextStart_ = 0; + char_.clear(); + } + + void pop_back() { + nextStart_ = start_.back(); + start_.pop_back(); + char_.resize(nextStart_); + } + + void push_back(const char *s, size_t bytes) { + for (size_t j{0}; j < bytes; ++j) { + AddChar(s[j]); + } + EndToken(); + } + + void push_back(const CharPointerWithLength &t) { + for (size_t j{0}; j < t.bytes; ++j) { + AddChar(t[j]); + } + EndToken(); + } + + void push_back(const std::string &s) { + size_t bytes{s.size()}; + for (size_t j{0}; j < bytes; ++j) { + AddChar(s[j]); + } + EndToken(); + } + + void shrink_to_fit() { + start_.shrink_to_fit(); + char_.shrink_to_fit(); + } + + private: + std::vector start_; + size_t nextStart_{0}; + std::vector char_; +}; + +// Defines a macro +class Definition { + public: + Definition(const TokenSequence &, size_t firstToken, size_t tokens); + Definition(const std::vector &argNames, const TokenSequence &, + size_t firstToken, size_t tokens); + + bool isFunctionLike() const { return isFunctionLike_; } + size_t argumentCount() const { return argumentCount_; } + bool isVariadic() const { return isVariadic_; } + bool isDisabled() const { return isDisabled_; } + const TokenSequence &replacement() const { return replacement_; } + + bool set_isDisabled(bool disable); + + TokenSequence Apply(const std::vector &args); + + private: + static TokenSequence Tokenize(const std::vector &argNames, + const TokenSequence &token, size_t firstToken, + size_t tokens); + + bool isFunctionLike_{false}; + size_t argumentCount_{0}; + bool isVariadic_{false}; + bool isDisabled_{false}; + TokenSequence replacement_; +}; + +// Preprocessing state +class Preprocessor { + public: + Preprocessor(Prescanner *ps) : prescanner_{ps} {} + + // When the input contains macros to be replaced, the new token sequence + // is appended to the output and the returned value is true. When + // no macro replacement is necessary, the output is unmodified and the + // return value is false. + bool MacroReplacement(const TokenSequence &, TokenSequence *); + + // Implements a preprocessor directive; returns an error message, or an + // empty string when successful. + std::string Directive(const TokenSequence &); + + private: + std::list names_; + std::unordered_map definitions_; + std::stack ifStack_; + Prescanner *prescanner_; +}; +} // namespace Fortran +#endif // FORTRAN_PREPROCESSOR_H_