[libc] Templatize the scanf Reader interface (#131037)

This allows specializing the implementation for different targets
without including unnecessary logic. It is similar to #111559, which did
the same for the printf Writer interface.
This commit is contained in:
Petr Hosek
2025-03-17 23:51:24 -07:00
committed by GitHub
parent 4781941160
commit 1fbfef9b8a
23 changed files with 732 additions and 880 deletions

View File

@@ -117,8 +117,8 @@ add_entrypoint_object(
sscanf.h
DEPENDS
libc.src.__support.arg_list
libc.src.stdio.scanf_core.reader
libc.src.stdio.scanf_core.scanf_main
libc.src.stdio.scanf_core.string_reader
)
add_entrypoint_object(
@@ -129,8 +129,8 @@ add_entrypoint_object(
vsscanf.h
DEPENDS
libc.src.__support.arg_list
libc.src.stdio.scanf_core.reader
libc.src.stdio.scanf_core.scanf_main
libc.src.stdio.scanf_core.string_reader
)
add_entrypoint_object(

View File

@@ -61,10 +61,8 @@ if(NOT(TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD AND
return()
endif()
add_object_library(
add_header_library(
scanf_main
SRCS
scanf_main.cpp
HDRS
scanf_main.h
DEPENDS
@@ -83,18 +81,19 @@ add_header_library(
reader.h
DEPENDS
libc.src.__support.macros.attributes
${file_deps}
${use_system_file}
)
add_object_library(
add_header_library(
string_reader
HDRS
string_reader.h
DEPENDS
.reader
libc.src.__support.macros.attributes
)
add_header_library(
converter
SRCS
converter.cpp
string_converter.cpp
int_converter.cpp
float_converter.cpp
ptr_converter.cpp
HDRS
converter.h
converter_utils.h

View File

@@ -1,103 +0,0 @@
//===-- Format specifier converter implementation for scanf ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/stdio/scanf_core/converter.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
#include "src/stdio/scanf_core/float_converter.h"
#endif // LIBC_COPT_SCANF_DISABLE_FLOAT
#include "src/stdio/scanf_core/current_pos_converter.h"
#include "src/stdio/scanf_core/int_converter.h"
#include "src/stdio/scanf_core/ptr_converter.h"
#include "src/stdio/scanf_core/string_converter.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// Runs the conversion described by `to_conv` against the characters supplied
// by `reader` and returns the status code of whichever routine handled it.
//
// The conversion name selects the handler. Names that are not recognised
// (including the floating point names when LIBC_COPT_SCANF_DISABLE_FLOAT is
// defined) are treated as literal text and matched against the raw string.
int convert(Reader *reader, const FormatSection &to_conv) {
  const char spec = to_conv.conv_name;

  // Step 1: handle everything that must not skip leading whitespace, and
  // filter out the names that are not conversions at all.
  switch (spec) {
  case '%':
    return raw_match(reader, "%");
  case 'c':
  case '[':
    return convert_string(reader, to_conv);
  case 'n':
    return convert_current_pos(reader, to_conv);
  case 's':
  case 'd':
  case 'i':
  case 'u':
  case 'o':
  case 'x':
  case 'X':
  case 'p':
#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
  case 'f':
  case 'F':
  case 'e':
  case 'E':
  case 'a':
  case 'A':
  case 'g':
  case 'G':
#endif // LIBC_COPT_SCANF_DISABLE_FLOAT
    break;
  default:
    return raw_match(reader, to_conv.raw_string);
  }

  // Step 2: every remaining conversion first discards leading whitespace. A
  // single space in the pattern matches any run of whitespace in the input.
  const int skip_status = raw_match(reader, " ");
  if (skip_status != READ_OK)
    return skip_status;

  // Step 3: hand off to the parser for this conversion.
  switch (spec) {
  case 's':
    return convert_string(reader, to_conv);
  case 'p':
    return convert_pointer(reader, to_conv);
#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
  case 'f':
  case 'F':
  case 'e':
  case 'E':
  case 'a':
  case 'A':
  case 'g':
  case 'G':
    return convert_float(reader, to_conv);
#endif // LIBC_COPT_SCANF_DISABLE_FLOAT
  default:
    // Only the integer conversions (d, i, u, o, x, X) can reach this point.
    return convert_int(reader, to_conv);
  }
}
// Matches the literal text `raw_string` against the characters produced by
// `reader`. raw_string is assumed to have a positive size.
//
// A whitespace character in the pattern consumes every consecutive whitespace
// character in the input (possibly none). Any other pattern character must
// equal the next input character exactly. Returns READ_OK when the whole
// pattern was matched and MATCHING_FAILURE at the first mismatch. In both
// cases the last character read is handed back to the reader.
int raw_match(Reader *reader, cpp::string_view raw_string) {
  int status = READ_OK;
  char next = reader->getc();

  for (size_t idx = 0; idx < raw_string.size(); ++idx) {
    const char expected = raw_string[idx];

    if (internal::isspace(expected)) {
      // Any space character matches any number of space characters.
      while (internal::isspace(next))
        next = reader->getc();
      continue;
    }

    if (expected != next) {
      status = MATCHING_FAILURE;
      break;
    }
    next = reader->getc();
  }

  // One character of lookahead is always outstanding here; return it.
  reader->ungetc(next);
  return status;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -10,10 +10,19 @@
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CONVERTER_H
#include "src/__support/CPP/string_view.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
#include "src/stdio/scanf_core/float_converter.h"
#endif // LIBC_COPT_SCANF_DISABLE_FLOAT
#include "src/stdio/scanf_core/current_pos_converter.h"
#include "src/stdio/scanf_core/int_converter.h"
#include "src/stdio/scanf_core/ptr_converter.h"
#include "src/stdio/scanf_core/string_converter.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
@@ -22,11 +31,81 @@ namespace scanf_core {
// convert will call the conversion function selected by the FormatSection's
// conversion name, which reads and parses characters obtained from the
// reader. Sections that are not conversions are matched as raw text.
int convert(Reader *reader, const FormatSection &to_conv);
// Selects a handler from the conversion name in `to_conv` and returns that
// handler's status code. Names with no dedicated case are matched literally
// against the section's raw string.
template <typename T>
int convert(Reader<T> *reader, const FormatSection &to_conv) {
  switch (to_conv.conv_name) {
  // Handlers that look at the very next character, whitespace included.
  case '%':
    return raw_match(reader, "%");
  case 'c':
  case '[':
    return convert_string(reader, to_conv);
  case 'n':
    return convert_current_pos(reader, to_conv);

  // Handlers that discard leading whitespace first. Matching a single space
  // consumes any run of whitespace in the input.
  case 's': {
    const int skipped = raw_match(reader, " ");
    return skipped == READ_OK ? convert_string(reader, to_conv) : skipped;
  }
  case 'd':
  case 'i':
  case 'u':
  case 'o':
  case 'x':
  case 'X': {
    const int skipped = raw_match(reader, " ");
    return skipped == READ_OK ? convert_int(reader, to_conv) : skipped;
  }
#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
  case 'f':
  case 'F':
  case 'e':
  case 'E':
  case 'a':
  case 'A':
  case 'g':
  case 'G': {
    const int skipped = raw_match(reader, " ");
    return skipped == READ_OK ? convert_float(reader, to_conv) : skipped;
  }
#endif // LIBC_COPT_SCANF_DISABLE_FLOAT
  case 'p': {
    const int skipped = raw_match(reader, " ");
    return skipped == READ_OK ? convert_pointer(reader, to_conv) : skipped;
  }

  // Not a conversion: match the text of the section itself.
  default:
    return raw_match(reader, to_conv.raw_string);
  }
}
// raw_match takes a raw string and matches it to the characters obtained from
// the reader.
int raw_match(Reader *reader, cpp::string_view raw_string);
// Compares `raw_string` with the upcoming input one pattern character at a
// time. Whitespace in the pattern absorbs a run of zero or more whitespace
// characters from the input; every other pattern character must match
// exactly. Returns READ_OK on a full match and MATCHING_FAILURE otherwise.
// The final character read is always returned to the reader.
template <typename T>
int raw_match(Reader<T> *reader, cpp::string_view raw_string) {
  const size_t pattern_len = raw_string.size();
  char lookahead = reader->getc();
  size_t pos = 0;

  while (pos < pattern_len) {
    const char want = raw_string[pos];
    if (internal::isspace(want)) {
      // Any space character matches any number of space characters.
      while (internal::isspace(lookahead))
        lookahead = reader->getc();
    } else if (want == lookahead) {
      lookahead = reader->getc();
    } else {
      // Mismatch: give the offending character back and report failure.
      reader->ungetc(lookahead);
      return MATCHING_FAILURE;
    }
    ++pos;
  }

  reader->ungetc(lookahead);
  return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -19,7 +19,8 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
LIBC_INLINE int convert_current_pos(Reader *reader,
template <typename T>
LIBC_INLINE int convert_current_pos(Reader<T> *reader,
const FormatSection &to_conv) {
write_int_with_length(reader->chars_read(), to_conv);
return READ_OK;

View File

@@ -1,229 +0,0 @@
//===-- Float type specifier converters for scanf ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/stdio/scanf_core/float_converter.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/char_vector.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// All of the floating point conversions are the same for scanf, every name will
// accept every style.
//
// Collects the text of one floating point input item from `reader` (honouring
// to_conv.max_width when it is positive) and passes it to
// write_float_with_length. Returns READ_OK on success, MATCHING_FAILURE when
// the input item is not a complete number, and ALLOCATION_FAILURE when the
// scratch buffer cannot grow.
//
// Whenever a character has been read but not accepted, it is handed back with
// ungetc before returning, so the first character after the input item stays
// unread.
int convert_float(Reader *reader, const FormatSection &to_conv) {
  // %a/A/e/E/f/F/g/G "Matches an optionally signed floating-point number,
  // infinity, or NaN, whose format is the same as expected for the subject
  // sequence of the strtod function. The corresponding argument shall be a
  // pointer to floating."

  CharVector out_str = CharVector();
  bool is_number = false;

  size_t max_width = cpp::numeric_limits<size_t>::max();
  if (to_conv.max_width > 0) {
    max_width = to_conv.max_width;
  }

  char cur_char = reader->getc();
  // Handle the sign.
  if (cur_char == '+' || cur_char == '-') {
    if (!out_str.append(cur_char)) {
      return ALLOCATION_FAILURE;
    }
    if (out_str.length() == max_width) {
      // A lone sign is not a number. Nothing was read past it.
      return MATCHING_FAILURE;
    } else {
      cur_char = reader->getc();
    }
  }

  static constexpr char DECIMAL_POINT = '.';
  static const char inf_string[] = "infinity";

  // Handle inf
  if (internal::tolower(cur_char) == inf_string[0]) {
    size_t inf_index = 0;

    for (;
         inf_index < (sizeof(inf_string) - 1) && out_str.length() < max_width &&
         internal::tolower(cur_char) == inf_string[inf_index];
         ++inf_index) {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    }

    // The loop runs at least once and reads a new character after every one
    // it accepts, so cur_char is always an unused lookahead here. Put it back
    // so the character following "inf"/"infinity" is not lost.
    reader->ungetc(cur_char);

    if (inf_index == 3 || inf_index == sizeof(inf_string) - 1) {
      write_float_with_length(out_str.c_str(), to_conv);
      return READ_OK;
    } else {
      return MATCHING_FAILURE;
    }
  }

  static const char nan_string[] = "nan";

  // Handle nan
  if (internal::tolower(cur_char) == nan_string[0]) {
    size_t nan_index = 0;

    for (;
         nan_index < (sizeof(nan_string) - 1) && out_str.length() < max_width &&
         internal::tolower(cur_char) == nan_string[nan_index];
         ++nan_index) {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    }

    // Same as for inf: cur_char is an unused lookahead, return it.
    reader->ungetc(cur_char);

    if (nan_index == sizeof(nan_string) - 1) {
      write_float_with_length(out_str.c_str(), to_conv);
      return READ_OK;
    } else {
      return MATCHING_FAILURE;
    }
  }

  // Assume base of 10 by default but check if it is actually base 16.
  int base = 10;

  // If the string starts with 0 it might be in hex.
  if (cur_char == '0') {
    is_number = true;
    // Read the next character to check.
    if (!out_str.append(cur_char)) {
      return ALLOCATION_FAILURE;
    }
    // If we've hit the end, then this is "0", which is valid.
    if (out_str.length() == max_width) {
      write_float_with_length(out_str.c_str(), to_conv);
      return READ_OK;
    } else {
      cur_char = reader->getc();
    }

    // If that next character is an 'x' then this is a hexadecimal number.
    if (internal::tolower(cur_char) == 'x') {
      base = 16;

      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      // If we've hit the end here, we have "0x" which is a valid prefix to a
      // floating point number, and will be evaluated to 0.
      if (out_str.length() == max_width) {
        write_float_with_length(out_str.c_str(), to_conv);
        return READ_OK;
      } else {
        cur_char = reader->getc();
      }
    }
  }

  const char exponent_mark = ((base == 10) ? 'e' : 'p');
  bool after_decimal = false;

  // The format for the remaining characters at this point is DD.DDe+/-DD for
  // base 10 and XX.XXp+/-DD for base 16

  // This handles the digits before and after the decimal point, but not the
  // exponent.
  while (out_str.length() < max_width) {
    if (internal::isalnum(cur_char) &&
        internal::b36_char_to_int(cur_char) < base) {
      is_number = true;
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    } else if (cur_char == DECIMAL_POINT && !after_decimal) {
      after_decimal = true;
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    } else {
      break;
    }
  }

  // Handle the exponent, which has an exponent mark, an optional sign, and
  // decimal digits. The width check matters: if the mantissa already used up
  // the whole field width, cur_char lies outside the input item and must not
  // be consumed even when it happens to be an exponent mark.
  if (out_str.length() < max_width &&
      internal::tolower(cur_char) == exponent_mark) {
    if (!out_str.append(cur_char)) {
      return ALLOCATION_FAILURE;
    }
    if (out_str.length() == max_width) {
      // This is laid out in the standard as being a matching error (100e is not
      // a valid float) but may conflict with existing implementations.
      return MATCHING_FAILURE;
    } else {
      cur_char = reader->getc();
    }

    if (cur_char == '+' || cur_char == '-') {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      if (out_str.length() == max_width) {
        return MATCHING_FAILURE;
      } else {
        cur_char = reader->getc();
      }
    }

    // It is specified by the standard that "100er" is a matching failure since
    // the longest prefix of a possibly valid floating-point number (which is
    // "100e") is not a valid floating-point number. If there is an exponent
    // mark then there must be a digit after it else the number is not valid.
    // Some implementations will roll back two characters (to just "100") and
    // accept that since the prefix is not valid, and some will interpret an
    // exponent mark followed by no digits as an additional exponent of 0
    // (accepting "100e" and returning 100.0). Both of these behaviors are wrong
    // by the standard, but they may be used in real code, see Hyrum's law. This
    // code follows the standard, but may be incompatible due to code expecting
    // these bugs.
    if (!internal::isdigit(cur_char)) {
      // cur_char (the 'r' in "100er") is not part of the input item; leave it
      // in the stream.
      reader->ungetc(cur_char);
      return MATCHING_FAILURE;
    }

    while (internal::isdigit(cur_char) && out_str.length() < max_width) {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    }
  }

  // We always read one more character than will be used, so we have to put the
  // last one back.
  reader->ungetc(cur_char);

  // If we haven't actually found any digits, this is a matching failure (this
  // catches cases like "+.")
  if (!is_number) {
    return MATCHING_FAILURE;
  }
  write_float_with_length(out_str.c_str(), to_conv);

  return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -9,7 +9,11 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_FLOAT_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_FLOAT_CONVERTER_H
#include "src/__support/CPP/limits.h"
#include "src/__support/char_vector.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
@@ -18,7 +22,210 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
int convert_float(Reader *reader, const FormatSection &to_conv);
// All of the floating point conversions are the same for scanf, every name will
// accept every style.
//
// Collects the text of one floating point input item from `reader` (honouring
// to_conv.max_width when it is positive) and passes it to
// write_float_with_length. Returns READ_OK on success, MATCHING_FAILURE when
// the input item is not a complete number, and ALLOCATION_FAILURE when the
// scratch buffer cannot grow.
//
// Whenever a character has been read but not accepted, it is handed back with
// ungetc before returning, so the first character after the input item stays
// unread.
template <typename T>
int convert_float(Reader<T> *reader, const FormatSection &to_conv) {
  // %a/A/e/E/f/F/g/G "Matches an optionally signed floating-point number,
  // infinity, or NaN, whose format is the same as expected for the subject
  // sequence of the strtod function. The corresponding argument shall be a
  // pointer to floating."

  CharVector out_str = CharVector();
  bool is_number = false;

  size_t max_width = cpp::numeric_limits<size_t>::max();
  if (to_conv.max_width > 0) {
    max_width = to_conv.max_width;
  }

  char cur_char = reader->getc();
  // Handle the sign.
  if (cur_char == '+' || cur_char == '-') {
    if (!out_str.append(cur_char)) {
      return ALLOCATION_FAILURE;
    }
    if (out_str.length() == max_width) {
      // A lone sign is not a number. Nothing was read past it.
      return MATCHING_FAILURE;
    } else {
      cur_char = reader->getc();
    }
  }

  static constexpr char DECIMAL_POINT = '.';
  static const char inf_string[] = "infinity";

  // Handle inf
  if (internal::tolower(cur_char) == inf_string[0]) {
    size_t inf_index = 0;

    for (;
         inf_index < (sizeof(inf_string) - 1) && out_str.length() < max_width &&
         internal::tolower(cur_char) == inf_string[inf_index];
         ++inf_index) {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    }

    // The loop runs at least once and reads a new character after every one
    // it accepts, so cur_char is always an unused lookahead here. Put it back
    // so the character following "inf"/"infinity" is not lost.
    reader->ungetc(cur_char);

    if (inf_index == 3 || inf_index == sizeof(inf_string) - 1) {
      write_float_with_length(out_str.c_str(), to_conv);
      return READ_OK;
    } else {
      return MATCHING_FAILURE;
    }
  }

  static const char nan_string[] = "nan";

  // Handle nan
  if (internal::tolower(cur_char) == nan_string[0]) {
    size_t nan_index = 0;

    for (;
         nan_index < (sizeof(nan_string) - 1) && out_str.length() < max_width &&
         internal::tolower(cur_char) == nan_string[nan_index];
         ++nan_index) {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    }

    // Same as for inf: cur_char is an unused lookahead, return it.
    reader->ungetc(cur_char);

    if (nan_index == sizeof(nan_string) - 1) {
      write_float_with_length(out_str.c_str(), to_conv);
      return READ_OK;
    } else {
      return MATCHING_FAILURE;
    }
  }

  // Assume base of 10 by default but check if it is actually base 16.
  int base = 10;

  // If the string starts with 0 it might be in hex.
  if (cur_char == '0') {
    is_number = true;
    // Read the next character to check.
    if (!out_str.append(cur_char)) {
      return ALLOCATION_FAILURE;
    }
    // If we've hit the end, then this is "0", which is valid.
    if (out_str.length() == max_width) {
      write_float_with_length(out_str.c_str(), to_conv);
      return READ_OK;
    } else {
      cur_char = reader->getc();
    }

    // If that next character is an 'x' then this is a hexadecimal number.
    if (internal::tolower(cur_char) == 'x') {
      base = 16;

      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      // If we've hit the end here, we have "0x" which is a valid prefix to a
      // floating point number, and will be evaluated to 0.
      if (out_str.length() == max_width) {
        write_float_with_length(out_str.c_str(), to_conv);
        return READ_OK;
      } else {
        cur_char = reader->getc();
      }
    }
  }

  const char exponent_mark = ((base == 10) ? 'e' : 'p');
  bool after_decimal = false;

  // The format for the remaining characters at this point is DD.DDe+/-DD for
  // base 10 and XX.XXp+/-DD for base 16

  // This handles the digits before and after the decimal point, but not the
  // exponent.
  while (out_str.length() < max_width) {
    if (internal::isalnum(cur_char) &&
        internal::b36_char_to_int(cur_char) < base) {
      is_number = true;
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    } else if (cur_char == DECIMAL_POINT && !after_decimal) {
      after_decimal = true;
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    } else {
      break;
    }
  }

  // Handle the exponent, which has an exponent mark, an optional sign, and
  // decimal digits. The width check matters: if the mantissa already used up
  // the whole field width, cur_char lies outside the input item and must not
  // be consumed even when it happens to be an exponent mark.
  if (out_str.length() < max_width &&
      internal::tolower(cur_char) == exponent_mark) {
    if (!out_str.append(cur_char)) {
      return ALLOCATION_FAILURE;
    }
    if (out_str.length() == max_width) {
      // This is laid out in the standard as being a matching error (100e is not
      // a valid float) but may conflict with existing implementations.
      return MATCHING_FAILURE;
    } else {
      cur_char = reader->getc();
    }

    if (cur_char == '+' || cur_char == '-') {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      if (out_str.length() == max_width) {
        return MATCHING_FAILURE;
      } else {
        cur_char = reader->getc();
      }
    }

    // It is specified by the standard that "100er" is a matching failure since
    // the longest prefix of a possibly valid floating-point number (which is
    // "100e") is not a valid floating-point number. If there is an exponent
    // mark then there must be a digit after it else the number is not valid.
    // Some implementations will roll back two characters (to just "100") and
    // accept that since the prefix is not valid, and some will interpret an
    // exponent mark followed by no digits as an additional exponent of 0
    // (accepting "100e" and returning 100.0). Both of these behaviors are wrong
    // by the standard, but they may be used in real code, see Hyrum's law. This
    // code follows the standard, but may be incompatible due to code expecting
    // these bugs.
    if (!internal::isdigit(cur_char)) {
      // cur_char (the 'r' in "100er") is not part of the input item; leave it
      // in the stream.
      reader->ungetc(cur_char);
      return MATCHING_FAILURE;
    }

    while (internal::isdigit(cur_char) && out_str.length() < max_width) {
      if (!out_str.append(cur_char)) {
        return ALLOCATION_FAILURE;
      }
      cur_char = reader->getc();
    }
  }

  // We always read one more character than will be used, so we have to put the
  // last one back.
  reader->ungetc(cur_char);

  // If we haven't actually found any digits, this is a matching failure (this
  // catches cases like "+.")
  if (!is_number) {
    return MATCHING_FAILURE;
  }
  write_float_with_length(out_str.c_str(), to_conv);

  return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -1,230 +0,0 @@
//===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/stdio/scanf_core/int_converter.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// This code is very similar to the code in __support/str_to_integer.h but is
// not quite the same. Here is the list of differences and why they exist:
// 1) This takes a reader and a format section instead of a char* and the base.
// This should be fairly self explanatory. While the char* could be adapted
// to a reader and the base could be calculated ahead of time, the
// semantics are slightly different, specifically a char* can be indexed
// freely (I can read str[2] and then str[0]) whereas a File (which the
// reader may contain) cannot.
// 2) Because this uses a Reader, this function can only unget once.
// This is relevant because scanf specifies it reads the "longest sequence
// of input characters which does not exceed any specified field width and
// which is, or is a prefix of, a matching input sequence." Whereas the
// strtol function accepts "the longest initial subsequence of the input
// string (...) that is of the expected form." This is demonstrated by the
// differences in how they deal with the string "0xZZZ" when parsing as
// hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
// since it reads the first 'Z', sees that it's not a valid hex digit, and
// reverses one character. The strtol function on the other hand only
// accepts the "0" since that's the longest valid hexadecimal sequence. It
// sees the 'Z' after the "0x" and determines that this is not the prefix
// to a valid hex string.
// 3) This conversion may have a maximum width.
// If a maximum width is specified, this conversion is only allowed to
// accept a certain number of characters. Strtol doesn't have any such
// limitation.
// Parses one integer input item from `reader` for the d, i, u, o, x, X (and
// p) conversions and passes the value to write_int_with_length.
//
// Returns READ_OK when a number was parsed and MATCHING_FAILURE when the
// input item is not a complete number. On overflow the value is clamped to
// the limit for the conversion's signedness and sign.
int convert_int(Reader *reader, const FormatSection &to_conv) {
  // Bases used with the strtol family, per the standard's description of each
  // conversion: %d and %u use 10, %o uses 8, %x/%X use 16, and %i uses 0
  // (detect from the prefix). %d and %i store into signed integers, the rest
  // into unsigned integers.
  const char spec = to_conv.conv_name;
  const bool is_signed = (spec == 'i' || spec == 'd');

  int base = 10;
  if (spec == 'i') {
    base = 0;
  } else if (spec == 'o') {
    base = 8;
  } else if (internal::tolower(spec) == 'x' || spec == 'p') {
    base = 16;
  }

  // A width of zero (or less) in the format section means "no limit".
  size_t max_width = cpp::numeric_limits<size_t>::max();
  if (to_conv.max_width > 0) {
    max_width = to_conv.max_width;
  }

  bool is_number = false;
  char next = reader->getc();

  // Optional sign. It counts against the field width.
  bool is_negative = false;
  if (next == '+' || next == '-') {
    is_negative = (next == '-');
    if (max_width <= 1) {
      // If the max width has been hit already, then the return value must be 0
      // since no actual digits of the number have been parsed yet.
      write_int_with_length(0, to_conv);
      return MATCHING_FAILURE;
    }
    --max_width;
    next = reader->getc();
  }

  // Base of 0 means automatically determine the base. Base of 16 may have a
  // prefix of "0x".
  if (base == 0 || base == 16) {
    if (next == '0') {
      // A leading zero is a number by itself, and may start an octal or hex
      // literal.
      is_number = true;
      if (max_width <= 1) {
        write_int_with_length(0, to_conv);
        return READ_OK;
      }
      --max_width;
      next = reader->getc();

      if (internal::tolower(next) == 'x') {
        // This is a valid hex prefix, but a prefix alone is not a number. The
        // standard (7.23.6.2 paragraphs 9 and 10) defines the input item as
        // the longest sequence that is, or is a prefix of, a matching
        // sequence, and makes it a matching failure when that item is not
        // itself a matching sequence. At least one hex digit must follow.
        is_number = false;
        base = 16;
        if (max_width <= 1) {
          return MATCHING_FAILURE;
        }
        --max_width;
        next = reader->getc();
      } else if (base == 0) {
        base = 8;
      }
    } else if (base == 0) {
      if (!internal::isdigit(next)) {
        // If the first character isn't a valid digit, then there are no valid
        // digits at all. The number is 0.
        reader->ungetc(next);
        write_int_with_length(0, to_conv);
        return MATCHING_FAILURE;
      }
      // Any other leading digit means decimal.
      base = 10;
    }
  }

  constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
  constexpr uintmax_t SIGNED_MAX =
      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
  constexpr uintmax_t NEGATIVE_SIGNED_MAX =
      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;

  // Largest magnitude that can be stored for this signedness and sign.
  uintmax_t limit = UNSIGNED_MAX;
  if (is_signed) {
    limit = is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX;
  }
  const uintmax_t max_div_by_base = limit / base;

  // The base is final from here on.
  const auto is_digit_in_base = [base](char ch) {
    return internal::isalnum(ch) && internal::b36_char_to_int(ch) < base;
  };

  if (is_digit_in_base(next)) {
    is_number = true;
  }

  uintmax_t result = 0;
  bool has_overflow = false;
  size_t digits_read = 0;
  while (digits_read < max_width && is_digit_in_base(next)) {
    const uintmax_t digit = internal::b36_char_to_int(next);

    if (result == limit) {
      // Already saturated: keep consuming digits but leave the value alone.
      has_overflow = true;
    } else {
      // Multiply by the base, saturating.
      if (result > max_div_by_base) {
        result = limit;
        has_overflow = true;
      } else {
        result *= base;
      }
      // Add the digit, saturating.
      if (result > limit - digit) {
        result = limit;
        has_overflow = true;
      } else {
        result += digit;
      }
    }

    ++digits_read;
    next = reader->getc();
  }

  // We always read one more character than will be used, so we have to put the
  // last one back.
  reader->ungetc(next);

  if (!is_number) {
    return MATCHING_FAILURE;
  }

  if (has_overflow) {
    write_int_with_length(limit, to_conv);
    return READ_OK;
  }

  if (is_negative) {
    result = -result;
  }
  write_int_with_length(result, to_conv);
  return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -9,7 +9,10 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H
#include "src/__support/CPP/limits.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
@@ -18,7 +21,212 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
int convert_int(Reader *reader, const FormatSection &to_conv);
// This code is very similar to the code in __support/str_to_integer.h but is
// not quite the same. Here is the list of differences and why they exist:
// 1) This takes a reader and a format section instead of a char* and the base.
// This should be fairly self explanatory. While the char* could be adapted
// to a reader and the base could be calculated ahead of time, the
// semantics are slightly different, specifically a char* can be indexed
// freely (I can read str[2] and then str[0]) whereas a File (which the
// reader may contain) cannot.
// 2) Because this uses a Reader, this function can only unget once.
// This is relevant because scanf specifies it reads the "longest sequence
// of input characters which does not exceed any specified field width and
// which is, or is a prefix of, a matching input sequence." Whereas the
// strtol function accepts "the longest initial subsequence of the input
// string (...) that is of the expected form." This is demonstrated by the
// differences in how they deal with the string "0xZZZ" when parsing as
// hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
// since it reads the first 'Z', sees that it's not a valid hex digit, and
// reverses one character. The strtol function on the other hand only
// accepts the "0" since that's the longest valid hexadecimal sequence. It
// sees the 'Z' after the "0x" and determines that this is not the prefix
// to a valid hex string.
// 3) This conversion may have a maximum width.
// If a maximum width is specified, this conversion is only allowed to
// accept a certain number of characters. Strtol doesn't have any such
// limitation.
template <typename T>
int convert_int(Reader<T> *reader, const FormatSection &to_conv) {
// %d "Matches an optionally signed decimal integer [...] with the value 10
// for the base argument. The corresponding argument shall be a pointer to
// signed integer."
// %i "Matches an optionally signed integer [...] with the value 0 for the
// base argument. The corresponding argument shall be a pointer to signed
// integer."
// %u "Matches an optionally signed decimal integer [...] with the value 10
// for the base argument. The corresponding argument shall be a pointer to
// unsigned integer"
// %o "Matches an optionally signed octal integer [...] with the value 8 for
// the base argument. The corresponding argument shall be a pointer to
// unsigned integer"
// %x/X "Matches an optionally signed hexadecimal integer [...] with the value
// 16 for the base argument. The corresponding argument shall be a pointer to
// unsigned integer"
size_t max_width = cpp::numeric_limits<size_t>::max();
if (to_conv.max_width > 0) {
max_width = to_conv.max_width;
}
uintmax_t result = 0;
bool is_number = false;
bool is_signed = false;
int base = 0;
if (to_conv.conv_name == 'i') {
base = 0;
is_signed = true;
} else if (to_conv.conv_name == 'o') {
base = 8;
} else if (internal::tolower(to_conv.conv_name) == 'x' ||
to_conv.conv_name == 'p') {
base = 16;
} else if (to_conv.conv_name == 'd') {
base = 10;
is_signed = true;
} else { // conv_name must be 'u'
base = 10;
}
char cur_char = reader->getc();
char result_sign = '+';
if (cur_char == '+' || cur_char == '-') {
result_sign = cur_char;
if (max_width > 1) {
--max_width;
cur_char = reader->getc();
} else {
// If the max width has been hit already, then the return value must be 0
// since no actual digits of the number have been parsed yet.
write_int_with_length(0, to_conv);
return MATCHING_FAILURE;
}
}
const bool is_negative = result_sign == '-';
// Base of 0 means automatically determine the base. Base of 16 may have a
// prefix of "0x"
if (base == 0 || base == 16) {
// If the first character is 0, then it could be octal or hex.
if (cur_char == '0') {
is_number = true;
// Read the next character to check.
if (max_width > 1) {
--max_width;
cur_char = reader->getc();
} else {
write_int_with_length(0, to_conv);
return READ_OK;
}
if (internal::tolower(cur_char) == 'x') {
// This is a valid hex prefix.
is_number = false;
// A valid hex prefix is not necessarily a valid number. For the
// conversion to be valid it needs to use all of the characters it
// consumes. From the standard:
// 7.23.6.2 paragraph 9: "An input item is defined as the longest
// sequence of input characters which does not exceed any specified
// field width and which is, or is a prefix of, a matching input
// sequence."
// 7.23.6.2 paragraph 10: "If the input item is not a matching sequence,
// the execution of the directive fails: this condition is a matching
// failure"
base = 16;
if (max_width > 1) {
--max_width;
cur_char = reader->getc();
} else {
return MATCHING_FAILURE;
}
} else {
if (base == 0) {
base = 8;
}
}
} else if (base == 0) {
if (internal::isdigit(cur_char)) {
// If the first character is a different number, then it's 10.
base = 10;
} else {
// If the first character isn't a valid digit, then there are no valid
// digits at all. The number is 0.
reader->ungetc(cur_char);
write_int_with_length(0, to_conv);
return MATCHING_FAILURE;
}
}
}
constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
constexpr uintmax_t SIGNED_MAX =
static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
constexpr uintmax_t NEGATIVE_SIGNED_MAX =
static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;
const uintmax_t MAX =
(is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX)
: UNSIGNED_MAX);
const uintmax_t max_div_by_base = MAX / base;
if (internal::isalnum(cur_char) &&
internal::b36_char_to_int(cur_char) < base) {
is_number = true;
}
bool has_overflow = false;
size_t i = 0;
for (; i < max_width && internal::isalnum(cur_char) &&
internal::b36_char_to_int(cur_char) < base;
++i, cur_char = reader->getc()) {
uintmax_t cur_digit = internal::b36_char_to_int(cur_char);
if (result == MAX) {
has_overflow = true;
continue;
} else if (result > max_div_by_base) {
result = MAX;
has_overflow = true;
} else {
result = result * base;
}
if (result > MAX - cur_digit) {
result = MAX;
has_overflow = true;
} else {
result = result + cur_digit;
}
}
// We always read one more character than will be used, so we have to put the
// last one back.
reader->ungetc(cur_char);
if (!is_number)
return MATCHING_FAILURE;
if (has_overflow) {
write_int_with_length(MAX, to_conv);
} else {
if (is_negative)
result = -result;
write_int_with_length(result, to_conv);
}
return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -1,46 +0,0 @@
//===-- Pointer type specifier converters for scanf -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/stdio/scanf_core/ptr_converter.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/int_converter.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// Converts a %p directive.
//
// The input is either the literal "(nullptr)" (compared case-insensitively),
// which produces a null pointer, or anything else, which is handed to
// convert_int to be parsed as a hexadecimal integer.
//
// Returns READ_OK when "(nullptr)" was matched in full, MATCHING_FAILURE when
// only a non-empty prefix of it was matched, and otherwise whatever
// convert_int returns.
int convert_pointer(Reader *reader, const FormatSection &to_conv) {
  static const char nullptr_string[] = "(nullptr)";
  constexpr size_t NULLPTR_LEN = sizeof(nullptr_string) - 1;

  // Check if it's exactly the nullptr string, if so then it's a nullptr.
  char cur_char = reader->getc();
  size_t i = 0;
  for (; i < NULLPTR_LEN && internal::tolower(cur_char) == nullptr_string[i];
       ++i) {
    cur_char = reader->getc();
  }

  // The loop always reads one character beyond the last one it matched. Put
  // that look-ahead character back on every path; otherwise the character
  // following "(nullptr)" is lost and a string reader is left positioned past
  // the character it should resume from.
  reader->ungetc(cur_char);

  if (i == NULLPTR_LEN) {
    // Only store the result when assignment is not suppressed.
    // NOTE(review): presumably output_ptr is not valid when NO_WRITE is set,
    // as the string conversion guards its stores the same way - confirm.
    if ((to_conv.flags & NO_WRITE) == 0)
      *reinterpret_cast<void **>(to_conv.output_ptr) = nullptr;
    return READ_OK;
  }

  // Some, but not all, of "(nullptr)" matched: the input item is invalid.
  if (i > 0)
    return MATCHING_FAILURE;

  // Else treat it as a hex int
  return convert_int(reader, to_conv);
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -9,8 +9,10 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PTR_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PTR_CONVERTER_H
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/int_converter.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
@@ -18,7 +20,30 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
int convert_pointer(Reader *reader, const FormatSection &to_conv);
// Converts a %p directive.
//
// The input is either the literal "(nullptr)" (compared case-insensitively),
// which produces a null pointer, or anything else, which is handed to
// convert_int to be parsed as a hexadecimal integer.
//
// Returns READ_OK when "(nullptr)" was matched in full, MATCHING_FAILURE when
// only a non-empty prefix of it was matched, and otherwise whatever
// convert_int returns.
template <typename T>
int convert_pointer(Reader<T> *reader, const FormatSection &to_conv) {
  static const char nullptr_string[] = "(nullptr)";
  constexpr size_t NULLPTR_LEN = sizeof(nullptr_string) - 1;

  // Check if it's exactly the nullptr string, if so then it's a nullptr.
  char cur_char = reader->getc();
  size_t i = 0;
  for (; i < NULLPTR_LEN && internal::tolower(cur_char) == nullptr_string[i];
       ++i) {
    cur_char = reader->getc();
  }

  // The loop always reads one character beyond the last one it matched. Put
  // that look-ahead character back on every path; otherwise the character
  // following "(nullptr)" is lost and a string reader is left positioned past
  // the character it should resume from.
  reader->ungetc(cur_char);

  if (i == NULLPTR_LEN) {
    // Only store the result when assignment is not suppressed.
    // NOTE(review): presumably output_ptr is not valid when NO_WRITE is set,
    // as the string conversion guards its stores the same way - confirm.
    if ((to_conv.flags & NO_WRITE) == 0)
      *reinterpret_cast<void **>(to_conv.output_ptr) = nullptr;
    return READ_OK;
  }

  // Some, but not all, of "(nullptr)" matched: the input item is invalid.
  if (i > 0)
    return MATCHING_FAILURE;

  // Else treat it as a hex int
  return convert_int(reader, to_conv);
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -9,17 +9,6 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_READER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_READER_H
#include "hdr/types/FILE.h"
#ifndef LIBC_COPT_STDIO_USE_SYSTEM_FILE
#include "src/__support/File/file.h"
#endif
#if defined(LIBC_TARGET_ARCH_IS_GPU)
#include "src/stdio/getc.h"
#include "src/stdio/ungetc.h"
#endif
#include "src/__support/macros/attributes.h" // For LIBC_INLINE
#include "src/__support/macros/config.h"
@@ -27,103 +16,24 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// We use the name "reader_internal" over "internal" because
// "internal" causes name lookups in files that include the current header to be
// ambiguous, i.e. `internal::foo` in those files will try to look up in
// `LIBC_NAMESPACE::scanf_core::internal` over `LIBC_NAMESPACE::internal` for
// e.g., `internal::ArgList` in `libc/src/stdio/scanf_core/scanf_main.h`
namespace reader_internal {

// Low-level single-character read and push-back helpers operating on an opaque
// stream handle. Exactly one of the three variants below is compiled,
// depending on the target and on LIBC_COPT_STDIO_USE_SYSTEM_FILE.

#if defined(LIBC_TARGET_ARCH_IS_GPU)

// On the GPU, FILE access is provided by the host operating system's library,
// so these helpers go through the public entrypoints just like the SYSTEM_FILE
// variant does. Entrypoints normally must not call each other; this is a
// deliberate exception.
// FIXME: We do not acquire any locks here, so this is not thread safe.
LIBC_INLINE int getc(void *f) {
  ::FILE *host_file = reinterpret_cast<::FILE *>(f);
  return LIBC_NAMESPACE::getc(host_file);
}

LIBC_INLINE void ungetc(int c, void *f) {
  ::FILE *host_file = reinterpret_cast<::FILE *>(f);
  LIBC_NAMESPACE::ungetc(c, host_file);
}

#elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)

// Reads one byte from the File without taking its lock. Yields '\0' when the
// read reports an error or does not deliver exactly one byte.
LIBC_INLINE int getc(void *f) {
  auto *file = reinterpret_cast<LIBC_NAMESPACE::File *>(f);
  unsigned char byte;
  auto outcome = file->read_unlocked(&byte, 1);
  size_t bytes_read = outcome.value;
  const bool got_one_byte = !outcome.has_error() && bytes_read == 1;
  if (!got_one_byte)
    return '\0';
  return byte;
}

// Pushes c back onto the File without taking its lock.
LIBC_INLINE void ungetc(int c, void *f) {
  auto *file = reinterpret_cast<LIBC_NAMESPACE::File *>(f);
  file->ungetc_unlocked(c);
}

#else // defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)

// Since ungetc_unlocked isn't always available, we don't acquire the lock for
// system files.
LIBC_INLINE int getc(void *f) {
  ::FILE *system_file = reinterpret_cast<::FILE *>(f);
  return ::getc(system_file);
}

LIBC_INLINE void ungetc(int c, void *f) {
  ::FILE *system_file = reinterpret_cast<::FILE *>(f);
  ::ungetc(c, system_file);
}

#endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE

} // namespace reader_internal
// This is intended to be either a raw string or a buffer synchronized with the
// file's internal buffer.
struct ReadBuffer {
// The characters to read from.
const char *buffer;
// Length supplied alongside `buffer`.
// NOTE(review): no code visible here reads this field - confirm whether it is
// meant to bound reads.
size_t buff_len;
// Index into `buffer` of the next character to hand out; starts at the
// beginning.
size_t buff_cur = 0;
};
class Reader {
ReadBuffer *rb;
void *input_stream = nullptr;
template <typename Derived> class Reader {
size_t cur_chars_read = 0;
public:
// TODO: Set buff_len with a proper constant
LIBC_INLINE Reader(ReadBuffer *string_buffer) : rb(string_buffer) {}
LIBC_INLINE Reader(void *stream, ReadBuffer *stream_buffer = nullptr)
: rb(stream_buffer), input_stream(stream) {}
// This returns the next character from the input and advances it by one
// character. When it hits the end of the string or file it returns '\0' to
// signal to stop parsing.
LIBC_INLINE char getc() {
++cur_chars_read;
if (rb != nullptr) {
char output = rb->buffer[rb->buff_cur];
++(rb->buff_cur);
return output;
}
// This should reset the buffer if applicable.
return static_cast<char>(reader_internal::getc(input_stream));
return static_cast<Derived *>(this)->getc();
}
// This moves the input back by one character, placing c into the buffer if
// this is a file reader, else c is ignored.
LIBC_INLINE void ungetc(char c) {
LIBC_INLINE void ungetc(int c) {
--cur_chars_read;
if (rb != nullptr && rb->buff_cur > 0) {
// While technically c should be written back to the buffer, in scanf we
// always write the character that was already there. Additionally, the
// buffer is most likely to contain a string that isn't part of a file,
// which may not be writable.
--(rb->buff_cur);
return;
}
reader_internal::ungetc(static_cast<int>(c), input_stream);
static_cast<Derived *>(this)->ungetc(c);
}
LIBC_INLINE size_t chars_read() { return cur_chars_read; }

View File

@@ -1,46 +0,0 @@
//===-- Starting point for scanf --------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/stdio/scanf_core/scanf_main.h"
#include "src/__support/arg_list.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/parser.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// Drives a whole scanf call: splits the format string `str` into sections and
// applies each one to `reader` in order, taking output arguments from `args`.
//
// Processing stops at the first section whose result is not READ_OK, or when
// the parser hands back a section with an empty raw string.
//
// Returns the number of conversions that completed with READ_OK. The %n
// (current position) conversion is never counted.
int scanf_main(Reader *reader, const char *__restrict str,
               internal::ArgList &args) {
  Parser<internal::ArgList> parser(str, args);
  int status = READ_OK;
  int assigned = 0;

  FormatSection section = parser.get_next_section();
  while (!section.raw_string.empty() && status == READ_OK) {
    if (!section.has_conv) {
      // Literal text (or whitespace) in the format string.
      status = raw_match(reader, section.raw_string);
    } else {
      status = convert(reader, section);
      // %n reports a position rather than assigning input, so it does not
      // contribute to the count.
      if (section.conv_name != 'n' && status == READ_OK)
        ++assigned;
    }
    section = parser.get_next_section();
  }

  return assigned;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -11,6 +11,9 @@
#include "src/__support/arg_list.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/converter.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/parser.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
@@ -18,8 +21,28 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
int scanf_main(Reader *reader, const char *__restrict str,
internal::ArgList &args);
// Drives a whole scanf call: splits the format string `str` into sections and
// applies each one to `reader` in order, taking output arguments from `args`.
//
// Processing stops at the first section whose result is not READ_OK, or when
// the parser hands back a section with an empty raw string.
//
// Returns the number of conversions that completed with READ_OK. The %n
// (current position) conversion is never counted.
template <typename T>
int scanf_main(Reader<T> *reader, const char *__restrict str,
               internal::ArgList &args) {
  Parser<internal::ArgList> parser(str, args);
  int status = READ_OK;
  int assigned = 0;

  FormatSection section = parser.get_next_section();
  while (!section.raw_string.empty() && status == READ_OK) {
    if (!section.has_conv) {
      // Literal text (or whitespace) in the format string.
      status = raw_match(reader, section.raw_string);
    } else {
      status = convert(reader, section);
      // %n reports a position rather than assigning input, so it does not
      // contribute to the count.
      if (section.conv_name != 'n' && status == READ_OK)
        ++assigned;
    }
    section = parser.get_next_section();
  }

  return assigned;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -1,77 +0,0 @@
//===-- String type specifier converters for scanf --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/stdio/scanf_core/string_converter.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// Handles the %s, %c and %[ conversions.
//
//   %s  "Matches a sequence of non-white-space characters"
//   %c  "Matches a sequence of characters of exactly the number specified by
//        the field width (1 if no field width is present in the directive)"
//   %[  "Matches a nonempty sequence of characters from a set of expected
//        characters (the scanset)."
//
// Characters are copied to the output pointer unless NO_WRITE is set. Returns
// MATCHING_FAILURE when no character was accepted, READ_OK otherwise.
int convert_string(Reader *reader, const FormatSection &to_conv) {
  const bool is_chars_conv = to_conv.conv_name == 'c';
  const bool is_string_conv = to_conv.conv_name == 's';
  const bool is_scanset_conv = to_conv.conv_name == '[';
  const bool should_write = (to_conv.flags & NO_WRITE) == 0;

  // An explicit width always wins. Without one, %c reads a single character
  // and the other conversions are unbounded.
  size_t width_limit = cpp::numeric_limits<size_t>::max();
  if (to_conv.max_width > 0)
    width_limit = to_conv.max_width;
  else if (is_chars_conv)
    width_limit = 1;

  char *dest = reinterpret_cast<char *>(to_conv.output_ptr);

  size_t count = 0;
  char next = reader->getc();
  while (count < width_limit && next != '\0') {
    // %s ends at whitespace; %[ ends at the first character outside the
    // scanset.
    const bool ends_item =
        (is_string_conv && internal::isspace(next)) ||
        (is_scanset_conv && !to_conv.scan_set.test(next));
    if (ends_item)
      break;

    if (should_write)
      dest[count] = next;

    next = reader->getc();
    ++count;
  }

  // One character more than was used has been read, so give it back.
  reader->ungetc(next);

  // %s and %[ always null terminate their output. This may write to byte
  // (max_width + 1), which is correct: the width limits how many characters
  // are read from the input, not the size of the output.
  if (!is_chars_conv && should_write)
    dest[count] = '\0';

  if (count == 0)
    return MATCHING_FAILURE;
  return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_CONVERTER_H
#include "src/__support/CPP/limits.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
@@ -18,7 +20,60 @@
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
int convert_string(Reader *reader, const FormatSection &to_conv);
// Handles the %s, %c and %[ conversions.
//
//   %s  "Matches a sequence of non-white-space characters"
//   %c  "Matches a sequence of characters of exactly the number specified by
//        the field width (1 if no field width is present in the directive)"
//   %[  "Matches a nonempty sequence of characters from a set of expected
//        characters (the scanset)."
//
// Characters are copied to the output pointer unless NO_WRITE is set. Returns
// MATCHING_FAILURE when no character was accepted, READ_OK otherwise.
template <typename T>
int convert_string(Reader<T> *reader, const FormatSection &to_conv) {
  const bool is_chars_conv = to_conv.conv_name == 'c';
  const bool is_string_conv = to_conv.conv_name == 's';
  const bool is_scanset_conv = to_conv.conv_name == '[';
  const bool should_write = (to_conv.flags & NO_WRITE) == 0;

  // An explicit width always wins. Without one, %c reads a single character
  // and the other conversions are unbounded.
  size_t width_limit = cpp::numeric_limits<size_t>::max();
  if (to_conv.max_width > 0)
    width_limit = to_conv.max_width;
  else if (is_chars_conv)
    width_limit = 1;

  char *dest = reinterpret_cast<char *>(to_conv.output_ptr);

  size_t count = 0;
  char next = reader->getc();
  while (count < width_limit && next != '\0') {
    // %s ends at whitespace; %[ ends at the first character outside the
    // scanset.
    const bool ends_item =
        (is_string_conv && internal::isspace(next)) ||
        (is_scanset_conv && !to_conv.scan_set.test(next));
    if (ends_item)
      break;

    if (should_write)
      dest[count] = next;

    next = reader->getc();
    ++count;
  }

  // One character more than was used has been read, so give it back.
  reader->ungetc(next);

  // %s and %[ always null terminate their output. This may write to byte
  // (max_width + 1), which is correct: the width limits how many characters
  // are read from the input, not the size of the output.
  if (!is_chars_conv && should_write)
    dest[count] = '\0';

  if (count == 0)
    return MATCHING_FAILURE;
  return READ_OK;
}
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -0,0 +1,49 @@
//===-- Reader definition for scanf -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_READER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_READER_H
#include "src/__support/macros/attributes.h" // For LIBC_INLINE
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace scanf_core {
// Reader implementation that takes its input from an in-memory character
// buffer rather than from a stream.
class StringReader : public Reader<StringReader> {
  const char *buffer;
  // Stored but not consulted by getc() or ungetc().
  [[maybe_unused]] size_t buff_len;
  // Index of the next character getc() will return.
  size_t buff_cur = 0;

public:
  LIBC_INLINE StringReader(const char *buffer, size_t buff_len)
      : buffer(buffer), buff_len(buff_len) {}

  // Returns the character at the current position and steps past it.
  LIBC_INLINE char getc() { return buffer[buff_cur++]; }

  // Steps back one position, unless already at the start of the buffer. The
  // argument is deliberately ignored: scanf only ever puts back the character
  // that was already there, and the buffer most likely holds a string that is
  // not part of a file and may not be writable.
  LIBC_INLINE void ungetc(int) {
    if (buff_cur == 0)
      return;
    --buff_cur;
  }
};
} // namespace scanf_core
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_STRING_READER_H

View File

@@ -38,6 +38,10 @@ LIBC_INLINE void funlockfile(::FILE *) { return; }
// Thin wrappers that forward to this library's own ferror, getc and ungetc
// entrypoints.
// Reports the stream's error indicator via LIBC_NAMESPACE::ferror.
LIBC_INLINE int ferror_unlocked(::FILE *f) { return LIBC_NAMESPACE::ferror(f); }
// Reads the next character from f via LIBC_NAMESPACE::getc.
LIBC_INLINE int getc(::FILE *f) { return LIBC_NAMESPACE::getc(f); }
// Pushes c back onto f via LIBC_NAMESPACE::ungetc; the result is discarded.
LIBC_INLINE void ungetc(int c, ::FILE *f) { LIBC_NAMESPACE::ungetc(c, f); }
#elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)
LIBC_INLINE void flockfile(FILE *f) {
@@ -52,6 +56,21 @@ LIBC_INLINE int ferror_unlocked(FILE *f) {
return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->error_unlocked();
}
// Reads one byte from the File behind f without taking its lock. Yields '\0'
// when the read reports an error or does not deliver exactly one byte.
LIBC_INLINE int getc(FILE *f) {
  auto *file = reinterpret_cast<LIBC_NAMESPACE::File *>(f);
  unsigned char byte;
  auto outcome = file->read_unlocked(&byte, 1);
  size_t bytes_read = outcome.value;
  const bool got_one_byte = !outcome.has_error() && bytes_read == 1;
  if (!got_one_byte)
    return '\0';
  return byte;
}
// Pushes c back onto the File behind f without taking its lock.
LIBC_INLINE void ungetc(int c, FILE *f) {
  auto *file = reinterpret_cast<LIBC_NAMESPACE::File *>(f);
  file->ungetc_unlocked(c);
}
#else // defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)
// Since ungetc_unlocked isn't always available, we don't acquire the lock for
@@ -62,17 +81,35 @@ LIBC_INLINE void funlockfile(::FILE *) { return; }
// Thin wrappers that forward to the global (system) ferror, getc and ungetc.
// Reports the stream's error indicator via ::ferror.
LIBC_INLINE int ferror_unlocked(::FILE *f) { return ::ferror(f); }
// Reads the next character from f via ::getc.
LIBC_INLINE int getc(::FILE *f) { return ::getc(f); }
// Pushes c back onto f via ::ungetc; the result is discarded.
LIBC_INLINE void ungetc(int c, ::FILE *f) { ::ungetc(c, f); }
#endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
} // namespace internal
namespace scanf_core {
// Reader implementation that takes its input from a FILE stream through the
// internal::getc / internal::ungetc helpers.
class StreamReader : public Reader<StreamReader> {
  ::FILE *stream;
  // Set when the most recent getc() received a negative result (EOF or a read
  // error) from the underlying stream, i.e. when no character was consumed.
  bool last_read_failed = false;

public:
  LIBC_INLINE StreamReader(::FILE *stream) : stream(stream) {}

  // Returns the next character of the stream, or '\0' when the stream reports
  // end-of-file or an error.
  //
  // Some internal::getc variants forward to a C getc, which signals
  // end-of-file with a negative value. Casting that straight to char yields a
  // non-zero character, so the converters - which stop on '\0' - would never
  // see the end of input. Map every negative result to '\0' instead.
  LIBC_INLINE char getc() {
    const int raw = internal::getc(static_cast<FILE *>(stream));
    last_read_failed = raw < 0;
    if (last_read_failed)
      return '\0';
    return static_cast<char>(raw);
  }

  // Pushes c back onto the stream. Nothing is pushed back when the preceding
  // getc() failed: no character was consumed then, and the '\0' end marker
  // must not be inserted into the stream.
  LIBC_INLINE void ungetc(int c) {
    if (last_read_failed)
      return;
    internal::ungetc(c, static_cast<FILE *>(stream));
  }
};
LIBC_INLINE int vfscanf_internal(::FILE *__restrict stream,
const char *__restrict format,
internal::ArgList &args) {
internal::flockfile(stream);
scanf_core::Reader reader(stream);
scanf_core::StreamReader reader(stream);
int retval = scanf_core::scanf_main(&reader, format, args);
if (retval == 0 && internal::ferror_unlocked(stream))
retval = EOF;

View File

@@ -11,8 +11,8 @@
#include "src/__support/CPP/limits.h"
#include "src/__support/arg_list.h"
#include "src/__support/macros/config.h"
#include "src/stdio/scanf_core/reader.h"
#include "src/stdio/scanf_core/scanf_main.h"
#include "src/stdio/scanf_core/string_reader.h"
#include "hdr/stdio_macros.h"
#include "hdr/types/FILE.h"
@@ -29,8 +29,7 @@ LLVM_LIBC_FUNCTION(int, sscanf,
// and pointer semantics, as well as handling
// destruction automatically.
va_end(vlist);
scanf_core::ReadBuffer rb{buffer, cpp::numeric_limits<size_t>::max()};
scanf_core::Reader reader(&rb);
scanf_core::StringReader reader(buffer, cpp::numeric_limits<size_t>::max());
int ret_val = scanf_core::scanf_main(&reader, format, args);
// This is done to avoid including stdio.h in the internals. On most systems
// EOF is -1, so this will be transformed into just "return ret_val".

View File

@@ -11,8 +11,8 @@
#include "hdr/stdio_macros.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/arg_list.h"
#include "src/stdio/scanf_core/reader.h"
#include "src/stdio/scanf_core/scanf_main.h"
#include "src/stdio/scanf_core/string_reader.h"
#include <stdarg.h>
@@ -21,9 +21,7 @@ namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(int, vsscanf,
(const char *buffer, const char *format, va_list vlist)) {
internal::ArgList args(vlist);
scanf_core::ReadBuffer rb{const_cast<char *>(buffer),
cpp::numeric_limits<size_t>::max()};
scanf_core::Reader reader(&rb);
scanf_core::StringReader reader(buffer, cpp::numeric_limits<size_t>::max());
int ret_val = scanf_core::scanf_main(&reader, format, args);
// This is done to avoid including stdio.h in the internals. On most systems
// EOF is -1, so this will be transformed into just "return ret_val".

View File

@@ -32,7 +32,7 @@ add_libc_unittest(
SRCS
reader_test.cpp
DEPENDS
libc.src.stdio.scanf_core.reader
libc.src.stdio.scanf_core.string_reader
libc.src.__support.CPP.string_view
COMPILE_OPTIONS
${use_system_file}
@@ -45,8 +45,8 @@ add_libc_unittest(
SRCS
converter_test.cpp
DEPENDS
libc.src.stdio.scanf_core.reader
libc.src.stdio.scanf_core.converter
libc.src.stdio.scanf_core.string_reader
libc.src.__support.CPP.string_view
COMPILE_OPTIONS
${use_system_file}

View File

@@ -9,14 +9,13 @@
#include "src/__support/CPP/string_view.h"
#include "src/stdio/scanf_core/converter.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
#include "src/stdio/scanf_core/string_reader.h"
#include "test/UnitTest/Test.h"
TEST(LlvmLibcScanfConverterTest, RawMatchBasic) {
const char *str = "abcdef";
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
// Reading "abc" should succeed.
ASSERT_EQ(LIBC_NAMESPACE::scanf_core::raw_match(&reader, "abc"),
@@ -51,8 +50,7 @@ TEST(LlvmLibcScanfConverterTest, RawMatchBasic) {
TEST(LlvmLibcScanfConverterTest, RawMatchSpaces) {
const char *str = " a \t\n b cd";
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
// Reading "a" should fail and not advance.
// Since there's nothing in the format string (the second argument to
@@ -98,8 +96,7 @@ TEST(LlvmLibcScanfConverterTest, RawMatchSpaces) {
TEST(LlvmLibcScanfConverterTest, StringConvSimple) {
const char *str = "abcDEF123 654LKJihg";
char result[20];
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
LIBC_NAMESPACE::scanf_core::FormatSection conv;
conv.has_conv = true;
@@ -120,8 +117,7 @@ TEST(LlvmLibcScanfConverterTest, StringConvSimple) {
TEST(LlvmLibcScanfConverterTest, StringConvNoWrite) {
const char *str = "abcDEF123 654LKJihg";
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
LIBC_NAMESPACE::scanf_core::FormatSection conv;
conv.has_conv = true;
@@ -141,8 +137,7 @@ TEST(LlvmLibcScanfConverterTest, StringConvNoWrite) {
TEST(LlvmLibcScanfConverterTest, StringConvWidth) {
const char *str = "abcDEF123 654LKJihg";
char result[6];
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
LIBC_NAMESPACE::scanf_core::FormatSection conv;
conv.has_conv = true;
@@ -175,8 +170,7 @@ TEST(LlvmLibcScanfConverterTest, StringConvWidth) {
TEST(LlvmLibcScanfConverterTest, CharsConv) {
const char *str = "abcDEF123 654LKJihg MNOpqr&*(";
char result[20];
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
LIBC_NAMESPACE::scanf_core::FormatSection conv;
conv.has_conv = true;
@@ -230,8 +224,7 @@ TEST(LlvmLibcScanfConverterTest, CharsConv) {
TEST(LlvmLibcScanfConverterTest, ScansetConv) {
const char *str = "abcDEF[123] 654LKJihg";
char result[20];
LIBC_NAMESPACE::scanf_core::ReadBuffer str_reader{str, sizeof(str)};
LIBC_NAMESPACE::scanf_core::Reader reader(&str_reader);
LIBC_NAMESPACE::scanf_core::StringReader reader(str, sizeof(str));
LIBC_NAMESPACE::scanf_core::FormatSection conv;
conv.has_conv = true;

View File

@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "src/__support/CPP/string_view.h"
#include "src/stdio/scanf_core/reader.h"
#include "src/stdio/scanf_core/string_reader.h"
#include "test/UnitTest/Test.h"
@@ -15,14 +15,14 @@ TEST(LlvmLibcScanfStringReaderTest, Constructor) {
char str[10];
// buff_len just needs to be a big number. The specific value isn't important
// in the real world.
LIBC_NAMESPACE::scanf_core::ReadBuffer rb{const_cast<char *>(str), 1000000};
LIBC_NAMESPACE::scanf_core::Reader reader(&rb);
LIBC_NAMESPACE::scanf_core::StringReader reader(const_cast<char *>(str),
1000000);
}
TEST(LlvmLibcScanfStringReaderTest, SimpleRead) {
const char *str = "abc";
LIBC_NAMESPACE::scanf_core::ReadBuffer rb{const_cast<char *>(str), 1000000};
LIBC_NAMESPACE::scanf_core::Reader reader(&rb);
LIBC_NAMESPACE::scanf_core::StringReader reader(const_cast<char *>(str),
1000000);
for (size_t i = 0; i < sizeof("abc"); ++i) {
ASSERT_EQ(str[i], reader.getc());
@@ -31,8 +31,8 @@ TEST(LlvmLibcScanfStringReaderTest, SimpleRead) {
TEST(LlvmLibcScanfStringReaderTest, ReadAndReverse) {
const char *str = "abcDEF123";
LIBC_NAMESPACE::scanf_core::ReadBuffer rb{const_cast<char *>(str), 1000000};
LIBC_NAMESPACE::scanf_core::Reader reader(&rb);
LIBC_NAMESPACE::scanf_core::StringReader reader(const_cast<char *>(str),
1000000);
for (size_t i = 0; i < 5; ++i) {
ASSERT_EQ(str[i], reader.getc());