We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
I made these string conversion functions so codecvt can be removed from common.cpp:
// Decodes the single UTF-8 sequence that starts at input[pos] and returns its
// code point. On success pos is advanced past the sequence.
//
// Precondition: pos < input.size().
// Throws std::runtime_error for truncated sequences, invalid lead or
// continuation bytes, overlong encodings, encoded UTF-16 surrogates and values
// above U+10FFFF.
static char32_t utf8_decode_code_point(const std::string& input, size_t& pos){
    const size_t remaining = input.size() - pos;
    const unsigned char lead = static_cast<unsigned char>(input[pos]);

    // Returns the 6 payload bits of the continuation byte at pos+offset after
    // verifying that the byte exists and has the form 10xxxxxx.
    const auto continuation = [&](size_t offset) -> char32_t {
        if(offset >= remaining)
            throw std::runtime_error("Invalid UTF-8 sequence");
        const unsigned char byte = static_cast<unsigned char>(input[pos + offset]);
        if((byte & 0xC0) != 0x80)
            throw std::runtime_error("Invalid UTF-8 sequence");
        return static_cast<char32_t>(byte & 0x3F);
    };

    char32_t codepoint = 0;
    if(lead < 0x80){
        // 1-byte sequence: 0xxxxxxx
        codepoint = lead;
        pos += 1;
    }
    else if((lead & 0xE0) == 0xC0){
        // 2-byte sequence: 110xxxxx 10xxxxxx
        const char32_t b1 = continuation(1);
        codepoint = (static_cast<char32_t>(lead & 0x1F) << 6) | b1;
        if(codepoint < 0x80)
            throw std::runtime_error("Overlong UTF-8 encoding");
        pos += 2;
    }
    else if((lead & 0xF0) == 0xE0){
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        const char32_t b1 = continuation(1);
        const char32_t b2 = continuation(2);
        codepoint = (static_cast<char32_t>(lead & 0x0F) << 12) | (b1 << 6) | b2;
        // Reject overlong forms and UTF-16 surrogates, which are not scalar values.
        if(codepoint < 0x800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF))
            throw std::runtime_error("Invalid UTF-8 encoding");
        pos += 3;
    }
    else if((lead & 0xF8) == 0xF0){
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        const char32_t b1 = continuation(1);
        const char32_t b2 = continuation(2);
        const char32_t b3 = continuation(3);
        codepoint = (static_cast<char32_t>(lead & 0x07) << 18) | (b1 << 12) | (b2 << 6) | b3;
        // Reject overlong forms and values beyond the Unicode range.
        if(codepoint < 0x10000 || codepoint > 0x10FFFF)
            throw std::runtime_error("Invalid UTF-8 encoding");
        pos += 4;
    }
    else{
        // Stray continuation byte or a lead byte that UTF-8 never uses.
        throw std::runtime_error("Invalid UTF-8 sequence");
    }
    return codepoint;
}

// Appends the UTF-8 encoding of codepoint to out.
//
// Throws std::runtime_error if codepoint is a UTF-16 surrogate (U+D800..U+DFFF)
// or lies above U+10FFFF.
static void utf8_append_code_point(std::string& out, char32_t codepoint){
    if(codepoint < 0x80){
        // ASCII: 0xxxxxxx
        out.push_back(static_cast<char>(codepoint));
    }
    else if(codepoint < 0x800){
        // 2-byte sequence: 110xxxxx 10xxxxxx
        out.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
        out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
    else if(codepoint < 0x10000){
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        if(codepoint >= 0xD800 && codepoint <= 0xDFFF)
            throw std::runtime_error("UTF-16 surrogate values are invalid in UTF-32");
        out.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
        out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
    else if(codepoint < 0x110000){
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
        out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
    else{
        throw std::runtime_error("Invalid Unicode code point");
    }
}

// Converts a UTF-8 string to UTF-32.
//
// Throws std::runtime_error if input is not well-formed UTF-8 (truncated or
// malformed sequences, overlong encodings, encoded surrogates, values above
// U+10FFFF).
std::u32string utf8_to_utf32(const std::string& input){
    std::u32string result;
    result.reserve(input.size()); // Upper bound: one code point per input byte

    for(size_t pos = 0; pos < input.size();){
        result.push_back(utf8_decode_code_point(input, pos));
    }
    return result;
}

// Converts a UTF-32 string to UTF-8.
//
// Throws std::runtime_error if input contains a UTF-16 surrogate value or a
// value above U+10FFFF.
std::string utf32_to_utf8(const std::u32string& input){
    std::string result;
    result.reserve(input.size() * 4); // Upper bound: four bytes per code point

    for(const char32_t codepoint : input){
        utf8_append_code_point(result, codepoint);
    }
    return result;
}

// Converts a UTF-8 string to a wide string.
//
// When wchar_t is wider than 16 bits every code point is stored in a single
// wchar_t. When wchar_t is 16 bits wide, code points above U+FFFF are stored
// as a UTF-16 surrogate pair instead of being collapsed to U+FFFD.
//
// Throws std::runtime_error if str is not well-formed UTF-8; the validation is
// the same as in utf8_to_utf32.
static std::wstring utf8_to_wstring(const std::string& str){
    std::wstring result;
    result.reserve(str.size()); // Upper bound: one code unit per input byte

    for(size_t pos = 0; pos < str.size();){
        const char32_t codepoint = utf8_decode_code_point(str, pos);
        if(codepoint < 0x10000 || sizeof(wchar_t) > 2){
            result.push_back(static_cast<wchar_t>(codepoint));
        }
        else{
            // 16-bit wchar_t: split the supplementary-plane code point into a
            // high/low surrogate pair.
            const char32_t offset = codepoint - 0x10000;
            result.push_back(static_cast<wchar_t>(0xD800 + (offset >> 10)));
            result.push_back(static_cast<wchar_t>(0xDC00 + (offset & 0x3FF)));
        }
    }
    return result;
}

// Converts a wide string to UTF-8.
//
// A high surrogate immediately followed by a low surrogate is combined into the
// supplementary-plane code point it represents and emitted as a 4-byte
// sequence. Values that cannot be represented in UTF-8 (unpaired surrogates,
// anything above U+10FFFF, negative values of a signed wchar_t) are replaced by
// U+FFFD, so this function does not throw for any input content.
static std::string wstring_to_utf8(const std::wstring& wstr){
    const char32_t replacement = 0xFFFD;
    // wchar_t may be signed: keep only the bits that make up one code unit so a
    // sign-extended 16-bit unit is not mistaken for an out-of-range value.
    const char32_t unit_mask = sizeof(wchar_t) < sizeof(char32_t) ? 0xFFFFu : 0xFFFFFFFFu;
    const auto unit_at = [&](size_t idx) -> char32_t {
        return static_cast<char32_t>(wstr[idx]) & unit_mask;
    };

    std::string result;
    // Worst case per code unit: 3 bytes for 16-bit units (a surrogate pair is
    // two units -> 4 bytes), 4 bytes for wider units.
    result.reserve(wstr.size() * (sizeof(wchar_t) > 2 ? 4 : 3));

    for(size_t i = 0; i < wstr.size(); ++i){
        char32_t codepoint = unit_at(i);

        if(codepoint >= 0xD800 && codepoint <= 0xDBFF){
            // High surrogate: valid only when a low surrogate follows.
            const char32_t low = (i + 1 < wstr.size()) ? unit_at(i + 1) : 0;
            if(low >= 0xDC00 && low <= 0xDFFF){
                codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low - 0xDC00);
                ++i; // The low surrogate has been consumed as well
            }
            else{
                codepoint = replacement;
            }
        }
        else if((codepoint >= 0xDC00 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF){
            // Unpaired low surrogate or a value outside the Unicode range.
            codepoint = replacement;
        }

        // codepoint is a valid scalar value here, so this call cannot throw.
        utf8_append_code_point(result, codepoint);
    }
    return result;
}
The text was updated successfully, but these errors were encountered:
No branches or pull requests
I made these string conversion functions so codecvt can be removed from common.cpp:
The text was updated successfully, but these errors were encountered: