Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new utf8proc_codepoint_units function #172

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ test/valid
test/iterate
test/case
test/custom
test/sizeofchar
/tmp/
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,13 @@ test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@

test/sizeofchar: test/sizeofchar.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) test/sizeofchar.c test/tests.o utf8proc.o -o $@

test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/sizeofchar test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
Expand All @@ -169,3 +172,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
test/iterate
test/case
test/custom
test/sizeofchar
41 changes: 41 additions & 0 deletions test/sizeofchar.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "tests.h"
#include <ctype.h>
#include <wchar.h>

int main(int argc, char **argv)
{
int c, error = 0;

(void) argc; /* unused */
(void) argv; /* unused */

/* some simple sanity tests of */
for (c = 0; c < 0x80; c++) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to check consistency with utf8proc_encode_char, i.e. check utf8proc_sizeof_char(c) == utf8proc_encode_char(c, buf) where you declare utf8proc_uint8_t buf[4].

if (utf8proc_sizeof_char(c) != 1) {
fprintf(stderr, "Failed: sizeof_char(%04x) != 1\n", c);
error++;
}
}
for (;c < 0x800; c++) {
if (utf8proc_sizeof_char(c) != 2) {
fprintf(stderr, "Failed: sizeof_char(%04x) != 2\n", c);
error++;
}
}
for (;c < 0x10000; c++) {
if (utf8proc_sizeof_char(c) != 3) {
fprintf(stderr, "Failed: sizeof_char(%06x) != 3\n", c);
error++;
}
}
for (;c < 0x110000; c++) {
if (utf8proc_sizeof_char(c) != 4) {
fprintf(stderr, "Failed: sizeof_char(%06x) != 4\n", c);
error++;
}
}
check(!error, "utf8proc_sizeof_char FAILED %d tests.", error);
printf("Validity tests SUCCEEDED.\n");

return 0;
}
13 changes: 13 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,19 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
}

/*
 * Returns the number of UTF-8 bytes needed to encode `uc`:
 *   1 for 0x00    .. 0x7f
 *   2 for 0x80    .. 0x7ff
 *   3 for 0x800   .. 0xffff
 *   4 for 0x10000 .. 0x10ffff
 * and 0 when `uc` is negative or 0x110000 and above.
 *
 * No Unicode validity check is performed beyond the range test, so
 * surrogates (0xd800-0xdfff) are reported as 3 bytes.
 */
UTF8PROC_DLLEXPORT int utf8proc_sizeof_char(utf8proc_int32_t uc)
{
  if (uc < 0x00) {
    /* uc is signed: without this guard a negative value would satisfy
       `uc < 0x80` and be reported as 1 byte, whereas
       utf8proc_encode_char returns 0 for negative input */
    return 0;
  } else if (uc < 0x80) {
    return 1;
  } else if (uc < 0x800) {
    return 2;
  } else if (uc < 0x10000) {
    return 3;
  } else if (uc < 0x110000) {
    return 4;
  } else return 0;
}

UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
if (uc < 0x00) {
return 0;
Expand Down
7 changes: 7 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);

/**
 * Returns the number of UTF-8 bytes required for the given codepoint:
 * 1 to 4 for codepoints in the range 0 to 0x10FFFF, or 0 if `codepoint`
 * is 0x110000 or greater.
 *
 * This function does not check whether `codepoint` is valid Unicode;
 * for example, surrogates (0xD800 to 0xDFFF) are reported as 3 bytes.
 */
UTF8PROC_DLLEXPORT int utf8proc_sizeof_char(utf8proc_int32_t codepoint);

/**
* Encodes the codepoint as an UTF-8 string in the byte array pointed
* to by `dst`. This array must be at least 4 bytes long.
Expand Down