Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replacement for deprecated codecvt string conversion #12151

Open
CommanderLake opened this issue Mar 2, 2025 · 0 comments
Open

Replacement for deprecated codecvt string conversion #12151

CommanderLake opened this issue Mar 2, 2025 · 0 comments

Comments

@CommanderLake
Copy link

CommanderLake commented Mar 2, 2025

I made these string conversion functions so that the deprecated codecvt facilities can be removed from common.cpp:

// Decodes a UTF-8 encoded byte string into a string of UTF-32 code points.
//
// input:   bytes to decode.
// returns: one char32_t per decoded code point, in input order.
// throws:  std::runtime_error when a sequence is truncated, has a bad lead or
//          continuation byte, is an overlong form, encodes a UTF-16 surrogate
//          (U+D800..U+DFFF), or encodes a value above U+10FFFF.
std::u32string utf8_to_utf32(const std::string& input){
	const size_t length = input.size();
	// True when `pos` is inside the input and holds a continuation byte (10xxxxxx)
	const auto is_trail = [&](size_t pos){
		return pos<length&&(static_cast<unsigned char>(input[pos])&0xC0)==0x80;
	};
	// The six payload bits carried by the continuation byte at `pos`
	const auto payload = [&](size_t pos){
		return static_cast<char32_t>(static_cast<unsigned char>(input[pos])&0x3F);
	};
	std::u32string decoded;
	decoded.reserve(length); // Upper bound: at most one code point per byte
	size_t pos = 0;
	while(pos<length){
		const unsigned char lead = static_cast<unsigned char>(input[pos]);
		char32_t value = 0;
		if(lead<0x80){
			// 0xxxxxxx: plain ASCII
			value = lead;
			pos += 1;
		} else if((lead>>5)==0x06){
			// 110xxxxx 10xxxxxx
			if(!is_trail(pos+1)) throw std::runtime_error("Invalid UTF-8 sequence");
			value = (static_cast<char32_t>(lead&0x1F)<<6)|payload(pos+1);
			// Values below 0x80 must use the 1-byte form
			if(value<0x80) throw std::runtime_error("Overlong UTF-8 encoding");
			pos += 2;
		} else if((lead>>4)==0x0E){
			// 1110xxxx 10xxxxxx 10xxxxxx
			if(!(is_trail(pos+1)&&is_trail(pos+2))) throw std::runtime_error("Invalid UTF-8 sequence");
			value = (static_cast<char32_t>(lead&0x0F)<<12)|(payload(pos+1)<<6)|payload(pos+2);
			// Reject overlong forms and encoded UTF-16 surrogates
			const bool overlong = value<0x800;
			const bool surrogate = value>=0xD800&&value<=0xDFFF;
			if(overlong||surrogate) throw std::runtime_error("Invalid UTF-8 encoding");
			pos += 3;
		} else if((lead>>3)==0x1E){
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			if(!(is_trail(pos+1)&&is_trail(pos+2)&&is_trail(pos+3))) throw std::runtime_error("Invalid UTF-8 sequence");
			value = (static_cast<char32_t>(lead&0x07)<<18)|(payload(pos+1)<<12)|(payload(pos+2)<<6)|payload(pos+3);
			// Reject overlong forms and values past the last Unicode code point
			if(value<0x10000||value>0x10FFFF) throw std::runtime_error("Invalid UTF-8 encoding");
			pos += 4;
		} else{
			// Stray continuation byte or a lead byte of the form 11111xxx
			throw std::runtime_error("Invalid UTF-8 sequence");
		}
		decoded.push_back(value);
	}
	return decoded;
}
// Encodes a string of UTF-32 code points as UTF-8.
//
// input:   code points to encode.
// returns: the UTF-8 byte sequence for all code points, in input order.
// throws:  std::runtime_error when a code point lies in the UTF-16 surrogate
//          range (U+D800..U+DFFF) or is above U+10FFFF.
std::string utf32_to_utf8(const std::u32string& input){
	std::string encoded;
	encoded.reserve(4*input.size()); // A code point needs at most four bytes
	// Appends the low eight bits of `bits` as one output byte
	const auto emit = [&encoded](char32_t bits){ encoded.push_back(static_cast<char>(bits)); };
	for(const char32_t cp : input){
		// Validate first so the encoding branches below only see legal values
		if(cp>0x10FFFF) throw std::runtime_error("Invalid Unicode code point");
		if(cp>=0xD800&&cp<=0xDFFF) throw std::runtime_error("UTF-16 surrogate values are invalid in UTF-32");
		if(cp<=0x7F){
			// 0xxxxxxx
			emit(cp);
			continue;
		}
		if(cp<=0x7FF){
			// 110xxxxx 10xxxxxx
			emit(0xC0|(cp>>6));
			emit(0x80|(cp&0x3F));
			continue;
		}
		if(cp<=0xFFFF){
			// 1110xxxx 10xxxxxx 10xxxxxx
			emit(0xE0|(cp>>12));
			emit(0x80|((cp>>6)&0x3F));
			emit(0x80|(cp&0x3F));
			continue;
		}
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		emit(0xF0|(cp>>18));
		emit(0x80|((cp>>12)&0x3F));
		emit(0x80|((cp>>6)&0x3F));
		emit(0x80|(cp&0x3F));
	}
	return encoded;
}
// UTF-8 to wstring conversion.
//
// Decodes `str` as UTF-8 and returns it as a wide string. Code points above
// U+FFFF are stored as a single wchar_t when wchar_t is at least 32 bits wide
// and as a UTF-16 surrogate pair otherwise, so no character is lost.
//
// str:     UTF-8 encoded bytes.
// returns: the decoded wide string.
// throws:  std::runtime_error when a sequence is truncated, has a bad lead or
//          continuation byte, is an overlong form, encodes a UTF-16 surrogate,
//          or encodes a value above U+10FFFF (same rules as utf8_to_utf32).
static std::wstring utf8_to_wstring(const std::string& str){
	// Returns the six payload bits of the continuation byte at `pos`;
	// throws if the input ends early or the byte is not of the form 10xxxxxx
	const auto continuation = [&str](size_t pos) -> char32_t {
		if(pos>=str.size()) throw std::runtime_error("Invalid UTF-8 sequence");
		const unsigned char b = static_cast<unsigned char>(str[pos]);
		if((b&0xC0)!=0x80) throw std::runtime_error("Invalid UTF-8 sequence");
		return static_cast<char32_t>(b&0x3F);
	};
	std::wstring result;
	result.reserve(str.size()); // Never more output units than input bytes
	for(size_t i = 0; i<str.size();){
		const unsigned char c = static_cast<unsigned char>(str[i]);
		char32_t codepoint = 0;
		if(c<0x80){
			// 1-byte sequence: 0xxxxxxx
			codepoint = c;
			i += 1;
		} else if((c&0xE0)==0xC0){
			// 2-byte sequence: 110xxxxx 10xxxxxx
			const char32_t b1 = continuation(i+1);
			codepoint = (static_cast<char32_t>(c&0x1F)<<6)|b1;
			// Values below 0x80 must use the 1-byte form
			if(codepoint<0x80) throw std::runtime_error("Overlong UTF-8 encoding");
			i += 2;
		} else if((c&0xF0)==0xE0){
			// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
			const char32_t b1 = continuation(i+1);
			const char32_t b2 = continuation(i+2);
			codepoint = (static_cast<char32_t>(c&0x0F)<<12)|(b1<<6)|b2;
			// Reject overlong forms and encoded UTF-16 surrogates
			if(codepoint<0x800||(codepoint>=0xD800&&codepoint<=0xDFFF)) throw std::runtime_error("Invalid UTF-8 encoding");
			i += 3;
		} else if((c&0xF8)==0xF0){
			// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			const char32_t b1 = continuation(i+1);
			const char32_t b2 = continuation(i+2);
			const char32_t b3 = continuation(i+3);
			codepoint = (static_cast<char32_t>(c&0x07)<<18)|(b1<<12)|(b2<<6)|b3;
			// Reject overlong forms and values past the last Unicode code point
			if(codepoint<0x10000||codepoint>0x10FFFF) throw std::runtime_error("Invalid UTF-8 encoding");
			i += 4;
		} else{ throw std::runtime_error("Invalid UTF-8 sequence"); }
		if(codepoint>0xFFFF&&sizeof(wchar_t)<4){
			// 16-bit wchar_t cannot hold the value directly: emit a UTF-16
			// surrogate pair (high surrogate first)
			const char32_t offset = codepoint-0x10000;
			result.push_back(static_cast<wchar_t>(0xD800|(offset>>10)));
			result.push_back(static_cast<wchar_t>(0xDC00|(offset&0x3FF)));
		} else{
			result.push_back(static_cast<wchar_t>(codepoint));
		}
	}
	return result;
}
// wstring to UTF-8 conversion.
//
// Encodes `wstr` as UTF-8. A high surrogate immediately followed by a low
// surrogate is combined into one code point and written as a 4-byte sequence,
// and code points above U+FFFF held in a wide wchar_t are written as 4-byte
// sequences as well. Units that cannot be encoded (an unpaired surrogate or a
// value above U+10FFFF) are written as U+FFFD, the replacement character.
//
// wstr:    wide string to encode.
// returns: the UTF-8 byte sequence; this function does not throw on bad input.
static std::string wstring_to_utf8(const std::wstring& wstr){
	// With a 16-bit wchar_t only the low 16 bits of a converted unit are
	// meaningful; the mask discards any sign extension from the cast
	const char32_t unit_mask = sizeof(wchar_t)>=4 ? 0xFFFFFFFFu : 0xFFFFu;
	// Appends the low eight bits of `bits` as one output byte
	std::string result;
	const auto emit = [&result](char32_t bits){ result.push_back(static_cast<char>(bits)); };
	// Worst case per unit: 4 bytes for a 32-bit wchar_t, 3 bytes for a 16-bit
	// one (a surrogate pair is two units producing four bytes)
	result.reserve(wstr.size()*(sizeof(wchar_t)>=4 ? 4 : 3));
	for(size_t i = 0; i<wstr.size(); i++){
		char32_t cp = static_cast<char32_t>(wstr[i])&unit_mask;
		if(cp>=0xD800&&cp<=0xDBFF){
			// High surrogate: valid only when a low surrogate follows
			const char32_t next = i+1<wstr.size() ? (static_cast<char32_t>(wstr[i+1])&unit_mask) : 0;
			if(next>=0xDC00&&next<=0xDFFF){
				cp = 0x10000+((cp-0xD800)<<10)+(next-0xDC00);
				i++; // The low surrogate has been consumed as well
			} else{ cp = 0xFFFD; }
		} else if((cp>=0xDC00&&cp<=0xDFFF)||cp>0x10FFFF){
			// Stray low surrogate or a value outside the Unicode range
			cp = 0xFFFD;
		}
		if(cp<0x80){
			// 1-byte sequence: 0xxxxxxx
			emit(cp);
		} else if(cp<0x800){
			// 2-byte sequence: 110xxxxx 10xxxxxx
			emit(0xC0|(cp>>6));
			emit(0x80|(cp&0x3F));
		} else if(cp<0x10000){
			// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
			emit(0xE0|(cp>>12));
			emit(0x80|((cp>>6)&0x3F));
			emit(0x80|(cp&0x3F));
		} else{
			// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			emit(0xF0|(cp>>18));
			emit(0x80|((cp>>12)&0x3F));
			emit(0x80|((cp>>6)&0x3F));
			emit(0x80|(cp&0x3F));
		}
	}
	return result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant