Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CPLRecode(): make ISO-8859-2 and -15 and CP437/CP1250/CP1251/CP1252 to UTF-8 always available #10799

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 50 additions & 5 deletions autotest/cpp/test_cpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2846,22 +2846,23 @@ TEST_F(test_cpl, CPLJSONDocument)
}

// Test CPLRecodeIconv() with re-allocation
// (this test also passed on Windows using its native recoding API)
TEST_F(test_cpl, CPLRecodeIconv)
{
#ifdef CPL_RECODE_ICONV
#if defined(CPL_RECODE_ICONV) || defined(_WIN32)
int N = 32800;
char *pszIn = static_cast<char *>(CPLMalloc(N + 1));
for (int i = 0; i < N; i++)
pszIn[i] = '\xE9';
pszIn[i] = '\xA1';
pszIn[N] = 0;
char *pszExpected = static_cast<char *>(CPLMalloc(N * 2 + 1));
for (int i = 0; i < N; i++)
{
pszExpected[2 * i] = '\xC3';
pszExpected[2 * i + 1] = '\xA9';
pszExpected[2 * i] = '\xD0';
pszExpected[2 * i + 1] = '\x81';
}
pszExpected[N * 2] = 0;
char *pszRet = CPLRecode(pszIn, "ISO-8859-2", CPL_ENC_UTF8);
char *pszRet = CPLRecode(pszIn, "ISO-8859-5", CPL_ENC_UTF8);
EXPECT_EQ(memcmp(pszExpected, pszRet, N * 2 + 1), 0);
CPLFree(pszIn);
CPLFree(pszRet);
Expand All @@ -2871,6 +2872,50 @@ TEST_F(test_cpl, CPLRecodeIconv)
#endif
}

// CP1252 to UTF-8: a string made of the sole Euro sign byte (0x80)
TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_strict_alloc)
{
    // Expected output: the 3-byte UTF-8 form of the Euro character followed
    // by the terminating NUL
    const char *pszExpected = "\xE2\x82\xAC\x00";
    const size_t nExpectedSize = 4;

    CPLClearRecodeWarningFlags();
    CPLErrorReset();

    // Recode with the quiet handler installed so nothing is printed
    CPLPushErrorHandler(CPLQuietErrorHandler);
    char *pszUTF8 = CPLRecode("\x80", "CP1252", CPL_ENC_UTF8);
    CPLPopErrorHandler();

    // The conversion must not have recorded any error message
    EXPECT_STREQ(CPLGetLastErrorMsg(), "");
    EXPECT_EQ(memcmp(pszUTF8, pszExpected, nExpectedSize), 0);

    CPLFree(pszUTF8);
}

// CP1252 to UTF-8: Euro sign byte (0x80) surrounded by ASCII characters
TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_with_ascii)
{
    // Expected output: 'x', the 3-byte UTF-8 Euro character, 'y' and the
    // terminating NUL
    const char *pszExpected = "x\xE2\x82\xACy\x00";
    const size_t nExpectedSize = 6;

    CPLClearRecodeWarningFlags();
    CPLErrorReset();

    // Recode with the quiet handler installed so nothing is printed
    CPLPushErrorHandler(CPLQuietErrorHandler);
    char *pszUTF8 = CPLRecode("x\x80y", "CP1252", CPL_ENC_UTF8);
    CPLPopErrorHandler();

    // The conversion must not have recorded any error message
    EXPECT_STREQ(CPLGetLastErrorMsg(), "");
    EXPECT_EQ(memcmp(pszUTF8, pszExpected, nExpectedSize), 0);

    CPLFree(pszUTF8);
}

// CP1252 to UTF-8: input containing a byte that is not valid CP1252
TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_with_warning)
{
    // Only the Euro character (and the terminating NUL) is expected in the
    // output
    const char *pszExpected = "\xE2\x82\xAC\x00";
    const size_t nExpectedSize = 4;

    CPLClearRecodeWarningFlags();
    CPLErrorReset();

    // \x90 is an invalid CP1252 character. Will be skipped
    CPLPushErrorHandler(CPLQuietErrorHandler);
    char *pszUTF8 = CPLRecode("\x90\x80", "CP1252", CPL_ENC_UTF8);
    CPLPopErrorHandler();

    // The skipped byte must have been reported through the error machinery
    EXPECT_STREQ(
        CPLGetLastErrorMsg(),
        "One or several characters couldn't be converted correctly from CP1252 "
        "to UTF-8. This warning will not be emitted anymore");
    EXPECT_EQ(memcmp(pszUTF8, pszExpected, nExpectedSize), 0);

    CPLFree(pszUTF8);
}

// Test CPLHTTPParseMultipartMime()
TEST_F(test_cpl, CPLHTTPParseMultipartMime)
{
Expand Down
9 changes: 9 additions & 0 deletions port/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,12 @@ if (NOT CMAKE_CROSSCOMPILING AND BUILD_VSIPRELOAD AND "${CMAKE_SYSTEM}" MATCHES
endforeach()
endif ()
endif ()

# Utility to generate cpl_character_sets.h and .c
# EXCLUDE_FROM_ALL keeps it out of the default build: it is only compiled when
# explicitly requested, or as a dependency of generate_cpl_character_sets.
add_executable(character_set_conv_table_generator EXCLUDE_FROM_ALL character_set_conv_table_generator.c)

# Custom target that must be manually invoked if character_set_conv_table_generator.c is modified
# The generator opens its output files with relative names, so running it with
# the source directory as working directory writes cpl_character_sets.c/.h
# next to the other sources of this directory.
add_custom_target(generate_cpl_character_sets
COMMAND $<TARGET_FILE:character_set_conv_table_generator>
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
DEPENDS character_set_conv_table_generator)
124 changes: 124 additions & 0 deletions port/character_set_conv_table_generator.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// SPDX-License-Identifier: MIT
// Copyright 2024 Even Rouault

#include <assert.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>

// Size in bytes (terminating NUL included) of the buffers receiving a
// laundered encoding name.
#define ENCODING_MAX_LEN 256

// Turn an encoding name into something usable inside a C identifier.
//
// srcEncoding is copied into srcEncodingLaundered (truncated if it does not
// fit in ENCODING_MAX_LEN bytes, always NUL-terminated), then every '-' of
// the copy is replaced by '_'. All other characters are left untouched.
static void launder_name(const char *srcEncoding,
                         char srcEncodingLaundered[ENCODING_MAX_LEN])
{
    snprintf(srcEncodingLaundered, ENCODING_MAX_LEN, "%s", srcEncoding);

    // Walk the copy in place up to its terminating NUL
    char *cursor = srcEncodingLaundered;
    while (*cursor != '\0')
    {
        if (*cursor == '-')
        {
            *cursor = '_';
        }
        ++cursor;
    }
}

// Append to c_file the definition of the conversion table of one encoding.
//
// The emitted object is "static const CPLCodePageConvTable
// CPL_<laundered name>_to_UTF8", preceded by `comment` wrapped in a C comment.
// It holds one row per source byte in the 0x80..0xFF range, in increasing
// order. Each row is the 3 bytes of the UTF-8 sequence that iconv produces for
// that byte (zero-padded when shorter), or {0, 0, 0} when iconv cannot convert
// it.
//
// c_file:      output stream receiving the table definition.
// h_file:      accepted for symmetry with the caller but not written to.
// srcEncoding: iconv name of the source encoding.
// comment:     free text describing the encoding.
//
// The process is terminated with exit(1) when the encoding is unknown to
// iconv, when a byte <= 127 is not converted to itself, or when a converted
// character would not fit in a 3-byte row. These checks are unconditional on
// purpose: with assert() they would disappear in NDEBUG builds and a truncated
// table could be generated silently.
static void generate(FILE *c_file, FILE *h_file, const char *srcEncoding,
                     const char *comment)
{
    (void)h_file;

    iconv_t sConv = iconv_open("UTF-8", srcEncoding);
    if (sConv == (iconv_t)(-1))
    {
        fprintf(stderr, "iconv_open(%s) failed\n", srcEncoding);
        exit(1);
    }
    char srcEncodingLaundered[ENCODING_MAX_LEN];
    launder_name(srcEncoding, srcEncodingLaundered);
    fprintf(c_file, "/* %s */\n", comment);
    fprintf(c_file, "static const CPLCodePageConvTable CPL_%s_to_UTF8 = {\n",
            srcEncodingLaundered);
    for (int i = 0; i <= 255; ++i)
    {
        unsigned char c = (unsigned char)i;
        size_t size_in = 1;
        unsigned char out[4] = {0, 0, 0, 0};
        size_t size_out = sizeof(out);
        char *p_in = (char *)&c;
        char *p_out = (char *)out;
        const size_t nConverted =
            iconv(sConv, &p_in, &size_in, &p_out, &size_out);
        if (i <= 127)
        {
            // The lower half is not stored in the table: it is only checked
            // to map to itself.
            if (out[0] != i)
            {
                fprintf(stderr,
                        "%s: byte 0x%02X is not converted to itself\n",
                        srcEncoding, (unsigned)i);
                exit(1);
            }
            continue;
        }
        if (nConverted != (size_t)-1)
        {
            // Number of bytes iconv actually wrote into out[]
            const size_t needed = sizeof(out) - size_out;
            if (needed > 3)
            {
                fprintf(stderr,
                        "%s: byte 0x%02X needs more than 3 UTF-8 bytes\n",
                        srcEncoding, (unsigned)i);
                exit(1);
            }
            fprintf(c_file, " {0x%02X, 0x%02X, 0x%02X},\n", out[0], out[1],
                    out[2]);
        }
        else
        {
            fprintf(c_file, " {0, 0, 0}, /* invalid */\n");
        }
    }
    fprintf(c_file, "};\n\n");
    iconv_close(sConv);
}

// Entry point of the generator.
//
// Creates cpl_character_sets.c and cpl_character_sets.h in the current
// working directory. The header receives the CPLCodePageConvTable typedef and
// the prototype of CPLGetConversionTableToUTF8(). The C file receives one
// conversion table per supported encoding and the definition of
// CPLGetConversionTableToUTF8(), which maps an encoding name to its table.
//
// Returns 0 on success, 1 when an output file cannot be created or written.
int main(void)
{
    FILE *c_file = fopen("cpl_character_sets.c", "wb");
    if (c_file == NULL)
    {
        fprintf(stderr, "Cannot create cpl_character_sets.c\n");
        return 1;
    }
    FILE *h_file = fopen("cpl_character_sets.h", "wb");
    if (h_file == NULL)
    {
        fprintf(stderr, "Cannot create cpl_character_sets.h\n");
        fclose(c_file);
        return 1;
    }

    fprintf(c_file, "/* This file has been generated by "
                    "character_set_conv_table_generator.c */\n");
    fprintf(c_file, "/* DO NOT EDIT !*/\n\n");
    fprintf(c_file, "/* clang-format off */\n");
    fprintf(c_file, "#include \"cpl_port.h\"\n");
    fprintf(c_file, "#include \"cpl_character_sets.h\"\n\n");

    fprintf(h_file, "/* This file has been generated by "
                    "character_set_conv_table_generator.c */\n");
    fprintf(h_file, "/* DO NOT EDIT !*/\n\n");
    fprintf(h_file, "/* clang-format off */\n");
    fprintf(h_file, "typedef unsigned char CPLCodePageConvTable[128][3];\n");

    // NULL-terminated list of the encodings for which a table is generated
    const struct
    {
        const char *name;
        const char *comment;
    } encodings[] = {
        {"CP437", "Character set of original IBM PC"},
        {"CP1250", "Central and eastern Europe languages"},
        {"CP1251", "Cyrillic script"},
        {"CP1252",
         "Legacy Windows single-byte character set used in a lot of countries"},
        {"ISO-8859-2", "Central Europe languages"},
        {"ISO-8859-15", "New Western Europe"},
        {NULL, NULL}};

    for (int i = 0; encodings[i].name; ++i)
    {
        generate(c_file, h_file, encodings[i].name, encodings[i].comment);
    }
    fprintf(h_file, "\n");
    fprintf(h_file, "const CPLCodePageConvTable* "
                    "CPLGetConversionTableToUTF8(const char* pszEncoding);\n");

    // Lookup function: one name comparison per generated table
    fprintf(c_file, "\nconst CPLCodePageConvTable* "
                    "CPLGetConversionTableToUTF8(const char* pszEncoding)\n");
    fprintf(c_file, "{\n");
    for (int i = 0; encodings[i].name; ++i)
    {
        char srcEncodingLaundered[ENCODING_MAX_LEN];
        launder_name(encodings[i].name, srcEncodingLaundered);
        fprintf(c_file, " if (EQUAL(pszEncoding, \"%s\"))\n",
                encodings[i].name);
        fprintf(c_file, " return &CPL_%s_to_UTF8;\n",
                srcEncodingLaundered);
    }
    fprintf(c_file, " return CPL_NULLPTR;\n");
    fprintf(c_file, "}\n");
    fprintf(c_file, "/* clang-format on */\n");
    fprintf(h_file, "/* clang-format on */\n");

    // A write error may only surface when the stream is flushed by fclose(),
    // so both the error indicators and the fclose() results are checked.
    int failed = ferror(c_file) || ferror(h_file);
    if (fclose(c_file) != 0)
        failed = 1;
    if (fclose(h_file) != 0)
        failed = 1;
    if (failed)
    {
        fprintf(stderr, "Error while writing cpl_character_sets.c/.h\n");
        return 1;
    }
    return 0;
}
Loading
Loading