Skip to content

Commit

Permalink
adding an identifier function
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed Dec 18, 2024
1 parent bf2aa3a commit edf2174
Show file tree
Hide file tree
Showing 7 changed files with 665 additions and 6 deletions.
17 changes: 17 additions & 0 deletions include/ada/idna/identifier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef ADA_IDNA_IDENTIFIER_H
#define ADA_IDNA_IDENTIFIER_H

#include <string>
#include <string_view>

namespace ada::idna {

// Reports whether the first code point of `input` is a valid name code point
// (see https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point).
//
// input: text whose first code point is examined; it is decoded as UTF-8 and
//        any bytes after the first code point are ignored.
// first: when true, the code point is checked against the IdentifierStart set
//        of code points; when false, against the IdentifierPart set.
//
// Returns false when `input` is empty or its leading bytes cannot be decoded
// into a code point.
bool valid_name_code_point(std::string_view input, bool first);

} // namespace ada::idna

#endif
1 change: 1 addition & 0 deletions include/idna.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
#include "ada/idna/validity.h"
#include "ada/idna/to_ascii.h"
#include "ada/idna/to_unicode.h"
#include "ada/idna/identifier.h"

#endif
14 changes: 8 additions & 6 deletions scripts/idna_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def get_table():
with open(filename, 'r') as file:
return file.read()


def get_version(table_data):
    """Return the text following '# Version: ' on the table's version line."""
    version_match = re.search("# Version: (.*)", table_data)
    return version_match.group(1)

Expand All @@ -81,7 +80,7 @@ def values(words):
return answer

UseSTD3ASCIIRules=False
if __name__ == "__main__":
def print_idna():
table_data = get_table()
print("// IDNA ", get_version(get_table()))
words = []
Expand Down Expand Up @@ -121,13 +120,16 @@ def values(words):
#include <cstdint>
namespace ada::idna {
""")
print("const uint32_t mappings["+str(len(long_mapped))+"] = ")
""")
print("const uint32_t mappings["+str(len(long_mapped))+"] =")
print(multiline_cpp_array_initializer(long_mapped), end=";\n")
print("const uint32_t table["+str(len(words))+"][2] = ")
print("const uint32_t table["+str(len(words))+"][2] =")
print(cpp_arrayarray_initializer(values(words)), end=";\n")
print("""
} // namespace ada::idna
#endif // ADA_IDNA_TABLES_H
""")
""")

if __name__ == "__main__":
print_idna()
541 changes: 541 additions & 0 deletions src/id_tables.cpp

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions src/identifier.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include "ada/idna/identifier.h"
#include <_types/_uint32_t.h>

#include <algorithm>
#include <array>
#include <string>

#include "id_tables.cpp"

namespace ada::idna {
// Decodes the code point encoded by the UTF-8 sequence at the start of `input`.
// Returns 0xffffffff on failure: empty input, a lead byte that does not open a
// 1- to 4-byte sequence, a truncated sequence, or a trailing byte that is not
// of the form 10xxxxxx.
// Validation is intentionally partial: overlong encodings, surrogates and
// values above U+10FFFF are decoded and returned as-is.
uint32_t get_first_code_point(std::string_view input) {
  constexpr uint32_t invalid = 0xffffffff;
  if (input.size() == 0) {
    return invalid;
  }

  const unsigned char lead = input.front();
  if (lead < 0x80) {
    // 0xxxxxxx: a single ASCII byte is its own code point.
    return lead;
  }

  // Classify the lead byte: total sequence length and the payload bits it
  // contributes to the code point.
  size_t length = 0;
  uint32_t value = 0;
  if ((lead >> 5) == 0x06) {
    // 110xxxxx
    length = 2;
    value = lead & 0x1F;
  } else if ((lead >> 4) == 0x0E) {
    // 1110xxxx
    length = 3;
    value = lead & 0x0F;
  } else if ((lead >> 3) == 0x1E) {
    // 11110xxx
    length = 4;
    value = lead & 0x07;
  } else {
    // Stray continuation byte or a lead byte for a 5+ byte sequence.
    return invalid;
  }

  // The whole sequence must be present.
  if (input.size() < length) {
    return invalid;
  }

  // Fold in six payload bits from each continuation byte.
  for (const char raw : input.substr(1, length - 1)) {
    const unsigned char trail = raw;
    if ((trail >> 6) != 0x02) {
      return invalid;
    }
    value = (value << 6) | (trail & 0x3F);
  }
  return value;
}

// Returns true when c is an ASCII letter, i.e. in 'A'..'Z' or 'a'..'z'.
bool is_ascii_letter(char c) {
  const bool is_upper = 'A' <= c && c <= 'Z';
  const bool is_lower = 'a' <= c && c <= 'z';
  return is_upper || is_lower;
}

// Returns true when c is an ASCII letter ('A'..'Z', 'a'..'z') or an ASCII
// decimal digit ('0'..'9').
bool is_ascii_letter_or_digit(char c) {
  if ('0' <= c && c <= '9') {
    return true;
  }
  if ('a' <= c && c <= 'z') {
    return true;
  }
  return 'A' <= c && c <= 'Z';
}

// Reports whether the first code point of `input` is a valid name code point:
// https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point
//
// input: text whose first code point is examined (decoded as UTF-8 by
//        get_first_code_point); bytes after that code point are ignored.
// first: true to test against IdentifierStart, false to test against
//        IdentifierPart (https://tc39.es/ecma262/#prod-IdentifierStart).
//
// Returns false when `input` is empty or its first code point cannot be
// decoded.
bool valid_name_code_point(std::string_view input, bool first) {
  if (input.empty()) {
    return false;
  }
  // Fast paths for ASCII, which avoid decoding and the table search.
  const char lead = input[0];
  if (first && (lead == '$' || lead == '_' || is_ascii_letter(lead))) {
    return true;
  }
  if (!first && (lead == '$' || is_ascii_letter_or_digit(lead))) {
    return true;
  }
  // Slow path: decode the code point and search the matching range table.
  const uint32_t code_point = get_first_code_point(input);
  if (code_point == 0xffffffff) {
    return false;  // minimal error handling
  }
  // Each table row is an inclusive [low, high] range; the binary search
  // assumes the rows are sorted and non-overlapping. Taking begin and end from
  // the same `ranges` argument guarantees the not-found check is made against
  // the end of the table that was actually searched (previously the
  // id_continue search was compared against the end of id_start).
  const auto in_ranges = [code_point](const auto& ranges) {
    const auto iter = std::lower_bound(
        std::begin(ranges), std::end(ranges), code_point,
        [](const uint32_t* range, uint32_t value) { return range[1] < value; });
    return iter != std::end(ranges) && code_point >= (*iter)[0];
  };
  return first ? in_ranges(id_start) : in_ranges(id_continue);
}
} // namespace ada::idna
1 change: 1 addition & 0 deletions src/idna.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
#include "validity.cpp"
#include "to_ascii.cpp"
#include "to_unicode.cpp"
#include "identifier.cpp"
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ link_libraries(ada-idna)
add_cpp_test(punycode_tests)
add_cpp_test(to_ascii_tests)
add_cpp_test(mapping_tests)
add_cpp_test(identifier_tests)
add_cpp_test(to_unicode_tests)
add_cpp_test(wpt_tests)

Expand Down

0 comments on commit edf2174

Please sign in to comment.