我需要在 C# 中将字符串转换为 UTF-8。我已经尝试了很多方法,但没有一个能达到我想要的效果。我将字符串转换为字节数组,然后尝试将其写入 XML 文件(编码为 UTF-8……),但要么得到完全相同的字符串(根本没有被编码),要么得到一串毫无用处的字节…… 有人遇到过同样的问题吗?
编辑: 这是我使用的一些代码:
// The text the question wants to "convert to UTF-8".
str= "testé";
// Encode the string into its UTF-8 byte sequence.
byte[] utf8Bytes = Encoding.UTF8.GetBytes(str);
// Decode the very same UTF-8 bytes back into a string, so this is an encode/decode round trip.
return Encoding.UTF8.GetString(utf8Bytes);
结果是“testé”,而我期望得到的是类似“testÃ©”的东西……
如果您想要一个 UTF8 字符串,其中每个字节都是正确的('Ö' -> [195, 0] , [150, 0]),您可以使用以下内容:
/// <summary>
/// Builds a string in which every char holds exactly one byte of the UTF-8
/// encoding of <paramref name="utf16String"/> ("UTF-8 inside UTF-16").
/// </summary>
/// <remarks>
/// .NET strings are sequences of UTF-16 code units. UTF-8 uses one to four
/// bytes per code point, for example:
///   'd' (U+0064) -> UTF-8 bytes [100]      -> result chars [100]
///   'κ' (U+03BA, Greek kappa) -> [206, 186] -> result chars [206, 186]
///   'Ö' (U+00D6) -> [195, 150]              -> result chars [195, 150]
/// The result is therefore not readable text; it is a carrier for UTF-8 bytes
/// for consumers that expect them widened to 16-bit units.
/// </remarks>
/// <param name="utf16String">The text to encode.</param>
/// <returns>A string whose length equals the UTF-8 byte count of the input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="utf16String"/> is null.</exception>
public static string Utf16ToUtf8(string utf16String)
{
    // Encode directly to UTF-8; going through a UTF-16 byte array first yields the same bytes.
    byte[] utf8Bytes = Encoding.UTF8.GetBytes(utf16String);

    // Pre-sized builder: appending in a loop with += would re-copy the whole string
    // on every iteration and make the method quadratic in the input length.
    StringBuilder utf8String = new StringBuilder(utf8Bytes.Length);
    foreach (byte utf8Byte in utf8Bytes)
    {
        // Widening cast puts the byte in the low 8 bits and zero in the high 8 bits,
        // independent of the platform's byte order.
        utf8String.Append((char)utf8Byte);
    }

    return utf8String.ToString();
}
在我的例子中,DLL 请求也是一个 UTF8 字符串,但不幸的是,UTF8 字符串必须用 UTF16 编码来解释('Ö' -> [195, 0], [19, 32])。因此 ANSI '–'(即 150)必须转换为 UTF16 '–'(即 8211)。如果您也遇到这种情况,可以使用以下替代:
/// <summary>
/// Encodes <paramref name="utf16String"/> as UTF-8 and then decodes those bytes
/// with <see cref="Encoding.Default"/>, returning the resulting string.
/// </summary>
/// <param name="utf16String">The text to encode.</param>
/// <returns>The UTF-8 bytes of the input as interpreted by <see cref="Encoding.Default"/>.</returns>
/// <remarks>
/// NOTE(review): the result depends on what Encoding.Default is on the running
/// platform/runtime; confirm it is the ANSI code page the consumer expects.
/// </remarks>
public static string Utf16ToUtf8(string utf16String)
{
    // Raw UTF-16 bytes of the input text.
    byte[] sourceBytes = Encoding.Unicode.GetBytes(utf16String);

    // Decoder used to reinterpret the UTF-8 bytes as a string.
    Encoding reinterpretAs = Encoding.Default;

    // Transcode UTF-16 -> UTF-8, then read the UTF-8 bytes back through the default encoding.
    return reinterpretAs.GetString(Encoding.Convert(Encoding.Unicode, Encoding.UTF8, sourceBytes));
}
或者本机方法:
// Win32 WideCharToMultiByte: converts a UTF-16 string to a multi-byte string in the given code page.
// The output StringBuilder is marshalled as an LPStr buffer.
[DllImport("kernel32.dll")]
private static extern Int32 WideCharToMultiByte(UInt32 CodePage, UInt32 dwFlags, [MarshalAs(UnmanagedType.LPWStr)] String lpWideCharStr, Int32 cchWideChar, [Out, MarshalAs(UnmanagedType.LPStr)] StringBuilder lpMultiByteStr, Int32 cbMultiByte, IntPtr lpDefaultChar, IntPtr lpUsedDefaultChar);

/// <summary>
/// Converts <paramref name="utf16String"/> to UTF-8 with the native
/// WideCharToMultiByte function and returns the bytes as marshalled back
/// through an LPStr buffer.
/// </summary>
/// <param name="utf16String">The text to encode.</param>
/// <returns>The converted string, or an empty string when there is nothing to convert or the native call fails.</returns>
/// <exception cref="ArgumentNullException"><paramref name="utf16String"/> is null.</exception>
public static string Utf16ToUtf8(string utf16String)
{
    if (utf16String == null)
    {
        throw new ArgumentNullException(nameof(utf16String));
    }

    // Nothing to convert; also avoids a native call that would report failure for a zero-length input.
    if (utf16String.Length == 0)
    {
        return String.Empty;
    }

    UInt32 utf8CodePage = Convert.ToUInt32(Encoding.UTF8.CodePage);

    // Pass -1 in BOTH calls so the size query and the conversion agree: with -1 the
    // function processes the terminating null and includes it in the returned byte count.
    Int32 requiredBytes = WideCharToMultiByte(utf8CodePage, 0, utf16String, -1, null, 0, IntPtr.Zero, IntPtr.Zero);

    // 0 signals failure; 1 would be the terminator alone.
    if (requiredBytes <= 1)
    {
        return String.Empty;
    }

    StringBuilder utf8String = new StringBuilder(requiredBytes);
    Int32 writtenBytes = WideCharToMultiByte(utf8CodePage, 0, utf16String, -1, utf8String, utf8String.Capacity, IntPtr.Zero, IntPtr.Zero);

    // A zero return means the conversion failed and the buffer content is not meaningful.
    return writtenBytes > 0 ? utf8String.ToString() : String.Empty;
}
如果您需要相反的方式,请参阅Utf8ToUtf16。 希望我能帮上忙。
C# 中的字符串始终是 UTF-16 编码,无法对它进行“转换”。只要您只是在内存中操作字符串,编码就无关紧要;只有当您将字符串写入流(文件、内存流、网络流……)时,编码才重要。
如果要将字符串写入 XML 文件,只需在创建 XmlWriter 时指定编码即可。
/// <summary>
/// Returns a string whose chars each carry one byte of the UTF-8 encoding of
/// <paramref name="utf16String"/> (high byte of every char is zero).
/// </summary>
/// <remarks>
/// Examples of the byte/char values involved:
///   'd' (U+0064): UTF-8 [100],      UTF-16 LE bytes [100, 0], result chars [100]
///   'κ' (U+03BA): UTF-8 [206, 186], UTF-16 LE bytes [186, 3], result chars [206, 186]
/// </remarks>
/// <param name="utf16String">The text to encode.</param>
/// <returns>A string with one char per UTF-8 byte of the input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="utf16String"/> is null.</exception>
private static string Utf16ToUtf8(string utf16String)
{
    // Encode directly to UTF-8; the intermediate UTF-16 byte array is not needed.
    byte[] utf8Bytes = Encoding.UTF8.GetBytes(utf16String);

    // One output char per UTF-8 byte.
    char[] chars = new char[utf8Bytes.Length];
    for (int i = 0; i < utf8Bytes.Length; i++)
    {
        // Zero-extend the byte to 16 bits; no temporary array and no dependence on byte order.
        chars[i] = (char)utf8Bytes[i];
    }

    return new String(chars);
}
原帖作者的代码是在循环中拼接字符串。在 .NET 中,每一次字符串操作都会重新创建一个字符串。字符串实际上是一种引用类型。因此,所提供的函数会明显变慢。不要那样做。请改用字符数组,直接写入其中,然后再将结果转换为字符串。在我处理 500 KB 文本的情况下,两者的差异几乎达到 5 分钟。
这个例子有帮助吗?
using System;
using System.IO;
using System.Text;
// Demonstrates encoding a string to UTF-8 and ASCII bytes and decoding it back,
// writing the results to "practice.txt" and echoing the decoded text to the console.
class Test
{
    public static void Main()
    {
        using (StreamWriter output = new StreamWriter("practice.txt"))
        {
            // Sample text containing U+03A0, a character outside the ASCII range.
            string sample = "Area = \u03A0r^2";

            // Encode the same text twice: once as UTF-8, once as ASCII.
            byte[] utf8Encoded = Encoding.UTF8.GetBytes(sample);
            byte[] asciiEncoded = Encoding.ASCII.GetBytes(sample);

            // Dump both byte sequences in hexadecimal form.
            string utf8Hex = BitConverter.ToString(utf8Encoded);
            string asciiHex = BitConverter.ToString(asciiEncoded);
            output.WriteLine("UTF-8 Bytes: {0}", utf8Hex);
            output.WriteLine("ASCII Bytes: {0}", asciiHex);

            // Decode each byte sequence back into a string and record the result.
            string utf8Decoded = Encoding.UTF8.GetString(utf8Encoded);
            string asciiDecoded = Encoding.ASCII.GetString(asciiEncoded);
            output.WriteLine("UTF-8 Text : {0}", utf8Decoded);
            output.WriteLine("ASCII Text : {0}", asciiDecoded);

            // Show the decoded text on the console as well.
            Console.WriteLine(utf8Decoded);
            Console.WriteLine(asciiDecoded);
        }
    }
}
// Round-trips a string through its UTF-16 byte representation and prints each stage.
class Program
{
    static void Main(string[] args)
    {
        String unicodeString =
            "This Unicode string contains two characters " +
            "with codes outside the traditional ASCII code range, " +
            "Pi (\u03a0) and Sigma (\u03a3).";
        Console.WriteLine("Original string:");
        Console.WriteLine(unicodeString);

        UnicodeEncoding unicodeEncoding = new UnicodeEncoding();
        byte[] utf16Bytes = unicodeEncoding.GetBytes(unicodeString);

        // GetBytes does not prepend a byte-order mark, so decoding must start at offset 0.
        // Starting at offset 2 would drop the first character of the text.
        char[] chars = unicodeEncoding.GetChars(utf16Bytes, 0, utf16Bytes.Length);
        string s = new string(chars);

        Console.WriteLine();
        Console.WriteLine("Char Array:");
        foreach (char c in chars)
        {
            Console.Write(c);
        }
        Console.WriteLine();

        Console.WriteLine();
        Console.WriteLine("String from Char Array:");
        Console.WriteLine(s);

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
}
我已经使用 TerraFX.Interop.Windows 测试了从 string 到 char* 的转换。
我找到了解决办法,我刚刚发明了自己的字符串和字符指针转换器......
namespace DeafMan1983.Interop.Runtime.Utilities;
using System;
/// <summary>
/// Helpers for converting between managed strings and null-terminated
/// UTF-16 character pointers (char*).
/// </summary>
public static unsafe class UtilitiesForUTF16
{
    /// <summary>
    /// Creates a managed string from a null-terminated UTF-16 buffer.
    /// </summary>
    /// <param name="ptr">Pointer to the first character, or null.</param>
    /// <returns>The characters before the first '\0', or an empty string when <paramref name="ptr"/> is null.</returns>
    public static string CharPointerToString(char* ptr)
    {
        if (ptr == null)
        {
            return string.Empty;
        }

        return new string(ptr, 0, CharPointerLength(ptr));
    }

    /// <summary>
    /// Copies a managed string into a newly allocated, null-terminated UTF-16 buffer
    /// in unmanaged memory.
    /// </summary>
    /// <param name="input">The string to copy, or null.</param>
    /// <returns>
    /// Pointer to the copy, or null when <paramref name="input"/> is null. The buffer is
    /// not moved or reclaimed by the garbage collector; release it with
    /// <see cref="FreeCharPointer"/> when it is no longer needed.
    /// </returns>
    public static char* StringToCharPointer(string input)
    {
        if (input == null)
        {
            return null;
        }

        // A pointer taken inside a fixed block (or from stackalloc) must not outlive
        // that block/method, so the characters are copied to unmanaged memory instead.
        return (char*)System.Runtime.InteropServices.Marshal.StringToHGlobalUni(input).ToPointer();
    }

    /// <summary>
    /// Releases a buffer previously returned by <see cref="StringToCharPointer"/>.
    /// </summary>
    /// <param name="ptr">The buffer to release; null is ignored.</param>
    public static void FreeCharPointer(char* ptr)
    {
        if (ptr != null)
        {
            System.Runtime.InteropServices.Marshal.FreeHGlobal(new IntPtr(ptr));
        }
    }

    /// <summary>
    /// Counts the characters before the terminating '\0', like strlen but for UTF-16 char*.
    /// </summary>
    /// <param name="charPtrs">Pointer to a null-terminated UTF-16 buffer, or null.</param>
    /// <returns>The number of characters before the terminator, or 0 when <paramref name="charPtrs"/> is null.</returns>
    public static int CharPointerLength(char* charPtrs)
    {
        if (charPtrs == null)
        {
            return 0;
        }

        int length = 0;
        while (charPtrs[length] != '\0')
        {
            length++;
        }

        return length;
    }
}
使用 Program.cs 进行测试:
// Test for char* (UTF16): convert a managed string to a char* and back.
string string_str1 = "Hello World!";
char* char_str1 = StringToCharPointer(string_str1);
Console.WriteLine($"Result: {CharPointerToString(char_str1)}");
Console.WriteLine($"Length of CharPointer: {CharPointerLength(char_str1)}");

// CharPointerToString and CharPointerLength scan until they find '\0', so a
// hand-built buffer must carry an explicit terminator or they read past its end.
char* char_str2 = stackalloc char[] { 'H', 'e', 'l', 'l', 'o', '\0' };
string str2 = CharPointerToString(char_str2);
Console.WriteLine($"Result: {str2}");
Console.WriteLine($"Length of CharPointer: {CharPointerLength(char_str2)}");
我已经使用 TerraFX.Interop.Windows 进行了测试,它没有显示成中文乱码。耶!不过我不确定这种情况是否还会发生;如果会,我们就需要添加一些 Encoding 类。我会去研究一下。(此处原有一张图片)
玩得开心,享受快乐的编码!