我想编写一个包含 UTF-8 编码的非 BMP 字符的 XML 文件。
使用以下代码时,生成的 XML 文件中的非 BMP Unicode 字符被替换成了数字字符引用。
package xml;
import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Demo program: builds a {@code <list>} document with one {@code <name>} element per sample
 * name and serializes it to {@code NameList.xml} with a UTF-8 output encoding.
 */
public class XMLClass {
  /** Sample names; the first ones contain characters outside the Basic Multilingual Plane. */
  static String names[] = {"𠀋一郎", "𠮷野","辻󠄀","👨👩👦"};

  /**
   * Creates the name-list document and writes it to {@code NameList.xml} in the working
   * directory.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    DocumentBuilder documentBuilder;
    try {
      documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    } catch (ParserConfigurationException e) {
      e.printStackTrace();
      // Without a builder no document can be created, so stop here instead of
      // dereferencing null further down.
      return;
    }

    Document document = documentBuilder.newDocument();
    document.setXmlStandalone(true);
    Element list = document.createElement("list");
    document.appendChild(list);

    for (String text : names) {
      Element name = document.createElement("name");
      list.appendChild(name);
      name.appendChild(document.createTextNode(text));
    }

    write(new File("NameList.xml"), document);
  }

  /**
   * Serializes {@code document} to {@code file} using an identity {@link Transformer} with
   * indentation enabled and the output encoding set to UTF-8.
   *
   * @param file destination file
   * @param document DOM document to serialize
   * @return {@code true} if the document was written; {@code false} if the transformer could
   *     not be created or the transformation failed (the stack trace is printed in that case)
   */
  public static boolean write(File file, Document document) {
    Transformer transformer;
    try {
      transformer = TransformerFactory.newInstance().newTransformer();
    } catch (TransformerConfigurationException e) {
      e.printStackTrace();
      return false;
    }

    transformer.setOutputProperty("indent", "yes");
    // Author's observation: with encoding "UTF-16" non-BMP characters are written as
    // characters, not as numeric character references; with "UTF-8" they are escaped.
    transformer.setOutputProperty("encoding", "UTF-8");
    // Xalan-specific property controlling the indentation width.
    transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "2");

    try {
      transformer.transform(new DOMSource(document), new StreamResult(file));
    } catch (TransformerException e) {
      e.printStackTrace();
      return false;
    }
    return true;
  }
}
我的期望是:
<?xml version="1.0" encoding="UTF-8"?><list>
<name>𠀋一郎</name>
<name>𠮷野</name>
<name>辻󠄀</name>
<name>👨👩👦</name>
</list>
但是我得到的是:
<?xml version="1.0" encoding="UTF-8"?><list>
<name>&#131083;一郎</name>
<name>&#134071;野</name>
<name>辻&#917760;</name>
<name>&#128104;&#128105;&#128102;</name>
</list>
当我指定使用 UTF-8 编码时,如何防止
javax.xml.transform.Transformer
将非 BMP Unicode 字符替换为数字字符引用?
这不是一个优雅的解决方案,但是……您可以不直接写入文件,而是写入管道,然后在另一个线程中读取该管道、自行还原数字字符引用,再把修复后的内容写入文件:
/**
 * Copies XML text from {@code xml} to {@code file}, replacing every numeric character
 * reference ({@code &#NNN;} or {@code &#xHHH;}) that denotes a supplementary (non-BMP) code
 * point with the character itself.
 *
 * <p>All other input is copied verbatim. That includes named entities, references to BMP
 * characters such as {@code &#13;} (decoding those could change how the document is parsed),
 * and malformed or unterminated references. The file is written with the default charset of
 * {@link Files#newBufferedWriter(java.nio.file.Path, java.nio.file.OpenOption...)}, i.e. UTF-8.
 *
 * <p>Comments and CDATA sections are not recognized: text inside them that looks like a
 * reference to a supplementary code point is replaced as well.
 *
 * @param xml source of the serialized XML; read until end of stream, not closed here
 * @param file destination file, created or truncated
 * @throws UncheckedIOException if reading or writing fails
 */
private static void writeWithoutNumericEntities(Reader xml,
                                                File file) {
    StringBuilder reference = new StringBuilder();
    try (Writer out = Files.newBufferedWriter(file.toPath())) {
        int c = xml.read();
        while (c >= 0) {
            if (c != '&') {
                out.write(c);
                c = xml.read();
                continue;
            }

            c = xml.read();
            if (c != '#') {
                // Not a numeric reference. Emit the ampersand and let the loop deal with
                // the character just read (end of stream, another '&', or plain text).
                out.write('&');
                continue;
            }

            // Keep the raw text of the reference so it can be emitted unchanged if it
            // turns out not to be decodable.
            reference.setLength(0);
            reference.append("&#");
            c = xml.read();
            int radix = 10;
            if (c == 'x' || c == 'X') {
                radix = 16;
                reference.append((char) c);
                c = xml.read();
            }
            int digitsStart = reference.length();
            // Stops at the first non-digit, which also covers end of stream (-1).
            while (c >= 0 && Character.digit(c, radix) >= 0) {
                reference.append((char) c);
                c = xml.read();
            }

            int codepoint = -1;
            if (c == ';' && reference.length() > digitsStart) {
                try {
                    codepoint = Integer.parseInt(reference.substring(digitsStart), radix);
                } catch (NumberFormatException tooLarge) {
                    // Value does not fit in an int, so it cannot be a code point.
                    codepoint = -1;
                }
            }

            if (codepoint >= 0 && Character.isSupplementaryCodePoint(codepoint)) {
                out.write(Character.toChars(codepoint));
                c = xml.read();
            } else {
                // Leave the reference as it was; the terminating character (if any) is
                // still in c and is handled by the next loop iteration.
                out.append(reference);
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
/**
 * Serializes {@code document} to {@code file} as indented UTF-8 XML, piping the transformer
 * output through {@code writeWithoutNumericEntities} on a separate virtual thread so that
 * the numeric character references produced by the serializer are post-processed before the
 * text reaches the file.
 *
 * @param file destination file
 * @param document DOM document to serialize
 * @return {@code true} only if both the transformation and the post-processing thread
 *     completed without error; {@code false} otherwise (stack traces are printed)
 */
public static boolean write(File file, Document document) {
    // Single-slot holder for an exception raised on the post-processing thread; it is read
    // only after join(), which makes the write visible to this thread.
    final Throwable[] fixerFailure = new Throwable[1];
    Thread fixXmlThread = null;
    boolean transformed = false;

    try {
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty("indent", "yes");
        // Author's observation: with encoding "UTF-16" non-BMP characters are written as
        // characters, not as numeric character references; with "UTF-8" they are escaped.
        transformer.setOutputProperty("encoding", "UTF-8");
        transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "2");

        // Closing the writer at the end of this block signals end of stream to the reader.
        try (PipedWriter destination = new PipedWriter()) {
            PipedReader xmlWithNumericEntities = new PipedReader(destination);
            fixXmlThread = Thread.startVirtualThread(() -> {
                // The reader is closed by the thread that consumes it, so it is never
                // closed while data is still pending.
                try (xmlWithNumericEntities) {
                    writeWithoutNumericEntities(xmlWithNumericEntities, file);
                } catch (IOException | RuntimeException e) {
                    fixerFailure[0] = e;
                }
            });
            transformer.transform(new DOMSource(document), new StreamResult(destination));
        }
        transformed = true;
    } catch (TransformerException | IOException e) {
        e.printStackTrace();
    }

    // Always wait for the post-processing thread, even after a failed transformation, so it
    // is not left writing to the file after this method has returned.
    if (fixXmlThread != null) {
        try {
            fixXmlThread.join();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return false;
        }
    }

    if (fixerFailure[0] != null) {
        fixerFailure[0].printStackTrace();
        return false;
    }
    return transformed;
}
上述代码没有考虑注释和 CDATA 部分,其中可能包含会让上述代码出错的文本。不过,可以对代码进行修改以处理注释和 CDATA。