[mlir] Make parser not rely on terminating null. #151007

jpienaar · 2025-07-28T17:57:11Z

Used in follow up to parse slices of buffer.

llvmbot · 2025-07-28T18:04:31Z

@llvm/pr-subscribers-mlir-core

Author: Jacques Pienaar (jpienaar)

Changes

Used in follow up to parse slices of buffer.

Full diff: https://github.com/llvm/llvm-project/pull/151007.diff

3 Files Affected:

(modified) mlir/lib/AsmParser/DialectSymbolParser.cpp (+7)
(modified) mlir/lib/AsmParser/Lexer.cpp (+24-3)
(modified) mlir/lib/AsmParser/Lexer.h (+3)

diff --git a/mlir/lib/AsmParser/DialectSymbolParser.cpp b/mlir/lib/AsmParser/DialectSymbolParser.cpp
index 9f4a87a6a02de..8b14e71118c3a 100644
--- a/mlir/lib/AsmParser/DialectSymbolParser.cpp
+++ b/mlir/lib/AsmParser/DialectSymbolParser.cpp
@@ -89,6 +89,7 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
     nestedPunctuation.pop_back();
     return success();
   };
+  const char *curBufferEnd = state.lex.getBufferEnd();
   do {
     // Handle code completions, which may appear in the middle of the symbol
     // body.
@@ -98,6 +99,12 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
       break;
     }
 
+    if (curBufferEnd == curPtr) {
+      if (!nestedPunctuation.empty())
+        return emitPunctError();
+      return emitError("unexpected nul or EOF in pretty dialect name");
+    }
+
     char c = *curPtr++;
     switch (c) {
     case '\0':
diff --git a/mlir/lib/AsmParser/Lexer.cpp b/mlir/lib/AsmParser/Lexer.cpp
index 751bd63e537f8..8f53529823e23 100644
--- a/mlir/lib/AsmParser/Lexer.cpp
+++ b/mlir/lib/AsmParser/Lexer.cpp
@@ -37,6 +37,18 @@ Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
              AsmParserCodeCompleteContext *codeCompleteContext)
     : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
   auto bufferID = sourceMgr.getMainFileID();
+
+  // Check to see if the main buffer contains the last buffer, and if so the
+  // last buffer should be used as main file for parsing.
+  if (sourceMgr.getNumBuffers() > 1) {
+    unsigned lastFileID = sourceMgr.getNumBuffers();
+    const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
+    const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
+    if (main->getBufferStart() <= last->getBufferStart() &&
+        main->getBufferEnd() >= last->getBufferEnd()) {
+      bufferID = lastFileID;
+    }
+  }
   curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
   curPtr = curBuffer.begin();
 
@@ -71,6 +83,7 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
 }
 
 Token Lexer::lexToken() {
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     const char *tokStart = curPtr;
 
@@ -78,6 +91,9 @@ Token Lexer::lexToken() {
     if (tokStart == codeCompleteLoc)
       return formToken(Token::code_complete, tokStart);
 
+    if (tokStart == curBufferEnd)
+      return formToken(Token::eof, tokStart);
+
     // Lex the next token.
     switch (*curPtr++) {
     default:
@@ -102,7 +118,7 @@ Token Lexer::lexToken() {
     case 0:
       // This may either be a nul character in the source file or may be the EOF
       // marker that llvm::MemoryBuffer guarantees will be there.
-      if (curPtr - 1 == curBuffer.end())
+      if (curPtr - 1 == curBufferEnd)
         return formToken(Token::eof, tokStart);
       continue;
 
@@ -259,7 +275,11 @@ void Lexer::skipComment() {
   assert(*curPtr == '/');
   ++curPtr;
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
+    if (curPtr == curBufferEnd)
+      return;
+
     switch (*curPtr++) {
     case '\n':
     case '\r':
@@ -267,7 +287,7 @@ void Lexer::skipComment() {
       return;
     case 0:
       // If this is the end of the buffer, end the comment.
-      if (curPtr - 1 == curBuffer.end()) {
+      if (curPtr - 1 == curBufferEnd) {
         --curPtr;
         return;
       }
@@ -405,6 +425,7 @@ Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
 Token Lexer::lexString(const char *tokStart) {
   assert(curPtr[-1] == '"');
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     // Check to see if there is a code completion location within the string. In
     // these cases we generate a completion location and place the currently
@@ -419,7 +440,7 @@ Token Lexer::lexString(const char *tokStart) {
     case 0:
       // If this is a random nul character in the middle of a string, just
       // include it.  If it is the end of file, then it is an error.
-      if (curPtr - 1 != curBuffer.end())
+      if (curPtr - 1 != curBufferEnd)
         continue;
       [[fallthrough]];
     case '\n':
diff --git a/mlir/lib/AsmParser/Lexer.h b/mlir/lib/AsmParser/Lexer.h
index 4085a9b73854b..670444eb1f5b4 100644
--- a/mlir/lib/AsmParser/Lexer.h
+++ b/mlir/lib/AsmParser/Lexer.h
@@ -40,6 +40,9 @@ class Lexer {
   /// Returns the start of the buffer.
   const char *getBufferBegin() { return curBuffer.data(); }
 
+  /// Returns the end of the buffer.
+  const char *getBufferEnd() { return curBuffer.end(); }
+
   /// Return the code completion location of the lexer, or nullptr if there is
   /// none.
   const char *getCodeCompleteLoc() const { return codeCompleteLoc; }

llvmbot · 2025-07-28T18:04:32Z

@llvm/pr-subscribers-mlir

Author: Jacques Pienaar (jpienaar)

Changes

Used in follow up to parse slices of buffer.

Full diff: https://github.com/llvm/llvm-project/pull/151007.diff

3 Files Affected:

(modified) mlir/lib/AsmParser/DialectSymbolParser.cpp (+7)
(modified) mlir/lib/AsmParser/Lexer.cpp (+24-3)
(modified) mlir/lib/AsmParser/Lexer.h (+3)

diff --git a/mlir/lib/AsmParser/DialectSymbolParser.cpp b/mlir/lib/AsmParser/DialectSymbolParser.cpp
index 9f4a87a6a02de..8b14e71118c3a 100644
--- a/mlir/lib/AsmParser/DialectSymbolParser.cpp
+++ b/mlir/lib/AsmParser/DialectSymbolParser.cpp
@@ -89,6 +89,7 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
     nestedPunctuation.pop_back();
     return success();
   };
+  const char *curBufferEnd = state.lex.getBufferEnd();
   do {
     // Handle code completions, which may appear in the middle of the symbol
     // body.
@@ -98,6 +99,12 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
       break;
     }
 
+    if (curBufferEnd == curPtr) {
+      if (!nestedPunctuation.empty())
+        return emitPunctError();
+      return emitError("unexpected nul or EOF in pretty dialect name");
+    }
+
     char c = *curPtr++;
     switch (c) {
     case '\0':
diff --git a/mlir/lib/AsmParser/Lexer.cpp b/mlir/lib/AsmParser/Lexer.cpp
index 751bd63e537f8..8f53529823e23 100644
--- a/mlir/lib/AsmParser/Lexer.cpp
+++ b/mlir/lib/AsmParser/Lexer.cpp
@@ -37,6 +37,18 @@ Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
              AsmParserCodeCompleteContext *codeCompleteContext)
     : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
   auto bufferID = sourceMgr.getMainFileID();
+
+  // Check to see if the main buffer contains the last buffer, and if so the
+  // last buffer should be used as main file for parsing.
+  if (sourceMgr.getNumBuffers() > 1) {
+    unsigned lastFileID = sourceMgr.getNumBuffers();
+    const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
+    const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
+    if (main->getBufferStart() <= last->getBufferStart() &&
+        main->getBufferEnd() >= last->getBufferEnd()) {
+      bufferID = lastFileID;
+    }
+  }
   curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
   curPtr = curBuffer.begin();
 
@@ -71,6 +83,7 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
 }
 
 Token Lexer::lexToken() {
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     const char *tokStart = curPtr;
 
@@ -78,6 +91,9 @@ Token Lexer::lexToken() {
     if (tokStart == codeCompleteLoc)
       return formToken(Token::code_complete, tokStart);
 
+    if (tokStart == curBufferEnd)
+      return formToken(Token::eof, tokStart);
+
     // Lex the next token.
     switch (*curPtr++) {
     default:
@@ -102,7 +118,7 @@ Token Lexer::lexToken() {
     case 0:
       // This may either be a nul character in the source file or may be the EOF
       // marker that llvm::MemoryBuffer guarantees will be there.
-      if (curPtr - 1 == curBuffer.end())
+      if (curPtr - 1 == curBufferEnd)
         return formToken(Token::eof, tokStart);
       continue;
 
@@ -259,7 +275,11 @@ void Lexer::skipComment() {
   assert(*curPtr == '/');
   ++curPtr;
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
+    if (curPtr == curBufferEnd)
+      return;
+
     switch (*curPtr++) {
     case '\n':
     case '\r':
@@ -267,7 +287,7 @@ void Lexer::skipComment() {
       return;
     case 0:
       // If this is the end of the buffer, end the comment.
-      if (curPtr - 1 == curBuffer.end()) {
+      if (curPtr - 1 == curBufferEnd) {
         --curPtr;
         return;
       }
@@ -405,6 +425,7 @@ Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
 Token Lexer::lexString(const char *tokStart) {
   assert(curPtr[-1] == '"');
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     // Check to see if there is a code completion location within the string. In
     // these cases we generate a completion location and place the currently
@@ -419,7 +440,7 @@ Token Lexer::lexString(const char *tokStart) {
     case 0:
       // If this is a random nul character in the middle of a string, just
       // include it.  If it is the end of file, then it is an error.
-      if (curPtr - 1 != curBuffer.end())
+      if (curPtr - 1 != curBufferEnd)
         continue;
       [[fallthrough]];
     case '\n':
diff --git a/mlir/lib/AsmParser/Lexer.h b/mlir/lib/AsmParser/Lexer.h
index 4085a9b73854b..670444eb1f5b4 100644
--- a/mlir/lib/AsmParser/Lexer.h
+++ b/mlir/lib/AsmParser/Lexer.h
@@ -40,6 +40,9 @@ class Lexer {
   /// Returns the start of the buffer.
   const char *getBufferBegin() { return curBuffer.data(); }
 
+  /// Returns the end of the buffer.
+  const char *getBufferEnd() { return curBuffer.end(); }
+
   /// Return the code completion location of the lexer, or nullptr if there is
   /// none.
   const char *getCodeCompleteLoc() const { return codeCompleteLoc; }

[mlir] Make parser not rely on terminating null.

215dc7f

Used in follow up to parse slices of buffer.

jpienaar force-pushed the nullterm branch from 748d8bb to 215dc7f Compare July 28, 2025 17:58

jpienaar requested a review from joker-eph July 28, 2025 18:03

jpienaar marked this pull request as ready for review July 28, 2025 18:03

Merge branch 'main' into nullterm

341b028

llvmbot added mlir:core MLIR Core Infrastructure mlir labels Jul 28, 2025

joker-eph approved these changes Jul 28, 2025

View reviewed changes

jpienaar merged commit 217f9e5 into llvm:main Jul 29, 2025
11 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[mlir] Make parser not rely on terminating null. #151007

[mlir] Make parser not rely on terminating null. #151007

Uh oh!

jpienaar commented Jul 28, 2025

Uh oh!

llvmbot commented Jul 28, 2025

Uh oh!

llvmbot commented Jul 28, 2025

Uh oh!

Uh oh!

Uh oh!

[mlir] Make parser not rely on terminating null. #151007

[mlir] Make parser not rely on terminating null. #151007

Uh oh!

Conversation

jpienaar commented Jul 28, 2025

Uh oh!

llvmbot commented Jul 28, 2025

Uh oh!

llvmbot commented Jul 28, 2025

Uh oh!

Uh oh!

Uh oh!