//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===// // // Copyright 2019 The MLIR Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================= // // This file implements the lexer for the MLIR textual form. // //===----------------------------------------------------------------------===// #include "Lexer.h" #include "llvm/Support/SourceMgr.h" using namespace mlir; using llvm::SMLoc; using llvm::SourceMgr; // Returns true if 'c' is an allowable puncuation character: [$._-] // Returns false otherwise. static bool isPunct(char c) { return c == '$' || c == '.' || c == '_' || c == '-'; } Lexer::Lexer(llvm::SourceMgr &sourceMgr, const SMDiagnosticHandlerTy &errorReporter) : sourceMgr(sourceMgr), errorReporter(errorReporter) { auto bufferID = sourceMgr.getMainFileID(); curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer(); curPtr = curBuffer.begin(); } /// emitError - Emit an error message and return an Token::error token. Token Lexer::emitError(const char *loc, const Twine &message) { errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc), SourceMgr::DK_Error, message)); return formToken(Token::error, loc); } Token Lexer::lexToken() { const char *tokStart = curPtr; switch (*curPtr++) { default: // Handle bare identifiers. if (isalpha(curPtr[-1])) return lexBareIdentifierOrKeyword(tokStart); // Unknown character, emit an error. return emitError(tokStart, "unexpected character"); case 0: // This may either be a nul character in the source file or may be the EOF // marker that llvm::MemoryBuffer guarantees will be there. if (curPtr-1 == curBuffer.end()) return formToken(Token::eof, tokStart); LLVM_FALLTHROUGH; case ' ': case '\t': case '\n': case '\r': // Ignore whitespace. return lexToken(); case ':': return formToken(Token::colon, tokStart); case ',': return formToken(Token::comma, tokStart); case '(': return formToken(Token::l_paren, tokStart); case ')': return formToken(Token::r_paren, tokStart); case '{': return formToken(Token::l_brace, tokStart); case '}': return formToken(Token::r_brace, tokStart); case '[': return formToken(Token::l_bracket, tokStart); case ']': return formToken(Token::r_bracket, tokStart); case '<': return formToken(Token::less, tokStart); case '>': return formToken(Token::greater, tokStart); case '=': return formToken(Token::equal, tokStart); case '+': return formToken(Token::plus, tokStart); case '*': return formToken(Token::star, tokStart); case '-': if (*curPtr == '>') { ++curPtr; return formToken(Token::arrow, tokStart); } return emitError(tokStart, "unexpected character"); case '?': if (*curPtr == '?') { ++curPtr; return formToken(Token::questionquestion, tokStart); } return formToken(Token::question, tokStart); case ';': return lexComment(); case '@': return lexAtIdentifier(tokStart); case '#': return lexAffineMapId(tokStart); case '"': return lexString(tokStart); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return lexNumber(tokStart); } } /// Lex a comment line, starting with a semicolon. /// /// TODO: add a regex for comments here and to the spec. /// Token Lexer::lexComment() { while (true) { switch (*curPtr++) { case '\n': case '\r': // Newline is end of comment. return lexToken(); case 0: // If this is the end of the buffer, end the comment. if (curPtr-1 == curBuffer.end()) { --curPtr; return lexToken(); } LLVM_FALLTHROUGH; default: // Skip over other characters. break; } } } /// Lex a bare identifier or keyword that starts with a letter. /// /// bare-id ::= letter (letter|digit|[_])* /// Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) { // Match the rest of the identifier regex: [0-9a-zA-Z_]* while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_') ++curPtr; // Check to see if this identifier is a keyword. StringRef spelling(tokStart, curPtr-tokStart); Token::Kind kind = llvm::StringSwitch(spelling) #define TOK_KEYWORD(SPELLING) \ .Case(#SPELLING, Token::kw_##SPELLING) #include "TokenKinds.def" .Default(Token::bare_identifier); return Token(kind, spelling); } /// Lex an '@foo' identifier. /// /// function-id ::= `@` bare-id /// Token Lexer::lexAtIdentifier(const char *tokStart) { // These always start with a letter. if (!isalpha(*curPtr++)) return emitError(curPtr-1, "expected letter in @ identifier"); while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_') ++curPtr; return formToken(Token::at_identifier, tokStart); } /// Lex an '#foo' identifier. /// /// affine-map-id ::= `#` suffix-id /// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)* /// // TODO(andydavis) Consider moving suffix-id parsing to a shared function // so it can be re-used to parse %suffix-id. Token Lexer::lexAffineMapId(const char *tokStart) { // Parse suffix-id. if (isdigit(*curPtr)) { // If suffix-id starts with a digit, the rest must be digits. while (isdigit(*curPtr)) { ++curPtr; } } else if (isalpha(*curPtr) || isPunct(*curPtr)) { do { ++curPtr; } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr)); } else { return emitError(curPtr-1, "invalid affine map id"); } return formToken(Token::affine_map_identifier, tokStart); } /// Lex an integer literal. /// /// integer-literal ::= digit+ | `0x` hex_digit+ /// Token Lexer::lexNumber(const char *tokStart) { assert(isdigit(curPtr[-1])); // Handle the hexadecimal case. if (curPtr[-1] == '0' && *curPtr == 'x') { ++curPtr; if (!isxdigit(*curPtr)) return emitError(curPtr, "expected hexadecimal digit"); while (isxdigit(*curPtr)) ++curPtr; return formToken(Token::integer, tokStart); } // Handle the normal decimal case. while (isdigit(*curPtr)) ++curPtr; return formToken(Token::integer, tokStart); } /// Lex a string literal. /// /// string-literal ::= '"' [^"\n\f\v\r]* '"' /// /// TODO: define escaping rules. Token Lexer::lexString(const char *tokStart) { assert(curPtr[-1] == '"'); while (1) { switch (*curPtr++) { case '"': return formToken(Token::string, tokStart); case '0': // If this is a random nul character in the middle of a string, just // include it. If it is the end of file, then it is an error. if (curPtr-1 != curBuffer.end()) continue; LLVM_FALLTHROUGH; case '\n': case '\v': case '\f': return emitError(curPtr-1, "expected '\"' in string literal"); default: continue; } } }