我正在尝试使用我的 NodeJS/Loopback 3 服务器以及
deepl-node
库来翻译 XML 文档。
到目前为止我的代码看起来像这样:
const { Translator, TextResult } = require('deepl-node');
const xml2js = require('xml2js');
// DeepL API key taken from the DEEPL_API_KEY environment variable.
const apiKey = process.env.DEEPL_API_KEY;
// Shared DeepL client; null when no API key is set, so users of it must check for null first.
const translator = apiKey ? new Translator(apiKey) : null;
/**
 * Calls `visit(workitem)` for every object found under
 * `parsedXml.workflow.worktask[*].workitem[*]`.
 *
 * Missing levels, non-array levels and non-object entries (xml2js yields an
 * empty string for an empty element) are skipped instead of throwing.
 *
 * @param {object} parsedXml - Document object as produced by xml2js.
 * @param {(workitem: object) => void} visit - Callback run per workitem.
 */
function forEachWorkitem(parsedXml, visit) {
  const workflow = parsedXml && parsedXml.workflow;
  const tasks = workflow && workflow.worktask;
  if (!Array.isArray(tasks)) {
    return;
  }
  tasks.forEach((task) => {
    if (!task || !Array.isArray(task.workitem)) {
      return;
    }
    task.workitem.forEach((workitem) => {
      if (workitem && typeof workitem === 'object') {
        visit(workitem);
      }
    });
  });
}

/**
 * Collects, in document order, every <media> node that has a non-empty
 * `title` attribute.
 *
 * @param {object} parsedXml - Document object as produced by xml2js.
 * @returns {object[]} The media nodes (same object references as in parsedXml).
 */
function collectTitledMedia(parsedXml) {
  const titled = [];
  forEachWorkitem(parsedXml, (workitem) => {
    if (!Array.isArray(workitem.media)) {
      return;
    }
    workitem.media.forEach((media) => {
      if (media && media.$ && media.$.title) {
        titled.push(media);
      }
    });
  });
  return titled;
}

/**
 * Escapes one parsed <text> entry. A plain string is escaped directly; an
 * object entry (the form used when the element has attributes) has its `_`
 * character content escaped in place. Anything else is returned untouched.
 *
 * @param {string|object} node - One entry of a workitem's `text` array.
 * @returns {string|object} The escaped entry.
 */
function escapeTextNode(node) {
  if (typeof node === 'string') {
    return DeepLService.escapeMarkdown(node);
  }
  if (node && typeof node._ === 'string') {
    node._ = DeepLService.escapeMarkdown(node._);
  }
  return node;
}

/**
 * Builds an Error with the given message and keeps the original failure
 * reachable through `.cause`, so the root stack trace is not lost.
 * The property is assigned manually so this also works on runtimes that
 * ignore the `new Error(msg, { cause })` option.
 *
 * @param {string} message - Message for the new error.
 * @param {*} cause - The error being wrapped.
 * @returns {Error} The wrapping error.
 */
function wrapWithCause(message, cause) {
  const wrapped = new Error(message);
  wrapped.cause = cause;
  return wrapped;
}

/**
 * Helpers for translating workflow XML documents through the DeepL client.
 */
const DeepLService = {
  /**
   * Replaces markdown-sensitive characters with placeholder tokens.
   *
   * NOTE(review): the placeholders are ordinary word-like tokens that are sent
   * to the translation service as part of the text. If the service rewrites
   * them (case, spacing, partial translation), unescapeMarkdown will not match
   * and stray `__`/`*` sequences can remain in the output — verify per target
   * language.
   *
   * @param {string} text - Text to escape.
   * @returns {string} Text with `*`, `~` and newlines replaced by placeholders.
   */
  escapeMarkdown: (text) => {
    return text
      .replace(/\*/g, '__ASTERISK__') // asterisks: bold, italics, list bullets
      .replace(/~/g, '__TILDE__') // tildes: strikethrough
      .replace(/\n/g, '__NEWLINE__'); // newlines, so they survive translation
  },

  /**
   * Reverses escapeMarkdown by turning the placeholder tokens back into the
   * original characters. Matching is exact and case-sensitive.
   *
   * @param {string} text - Text containing placeholder tokens.
   * @returns {string} Text with placeholders restored to `*`, `~` and newlines.
   */
  unescapeMarkdown: (text) => {
    return text
      .replace(/__ASTERISK__/g, '*')
      .replace(/__TILDE__/g, '~')
      .replace(/__NEWLINE__/g, '\n');
  },

  /**
   * Translates the `title` attribute of every <media> element, in place.
   *
   * Titles are escaped, translated one request per title (run concurrently),
   * unescaped, and only written back once every request has succeeded, so a
   * failed request leaves the document unmodified.
   *
   * @param {object} parsedXml - Document object as produced by xml2js.
   * @param {string} targetLang - Target language code passed to the client.
   * @returns {Promise<object>} The same `parsedXml` object.
   */
  translateAttributes: async (parsedXml, targetLang) => {
    const titledMedia = collectTitledMedia(parsedXml);

    // Nothing to do without a configured client or without any titles.
    if (!translator || titledMedia.length === 0) {
      return parsedXml;
    }

    const translatedTitles = await Promise.all(
      titledMedia.map(async (media) => {
        const escapedTitle = DeepLService.escapeMarkdown(media.$.title);
        const result = await translator.translateText(escapedTitle, null, targetLang);
        return DeepLService.unescapeMarkdown(result.text);
      })
    );

    titledMedia.forEach((media, index) => {
      media.$.title = translatedTitles[index];
    });
    return parsedXml;
  },

  /**
   * Parses the XML, escapes markdown in every <text> element of every
   * workitem, translates media titles, and serialises the document again.
   *
   * @param {string} xml - Source XML document.
   * @param {string} targetLang - Target language code for the media titles.
   * @returns {Promise<string>} The rebuilt XML with escaped <text> content.
   * @throws {Error} "Parsing error: ..." with the original error as `.cause`.
   */
  parseXML: async (xml, targetLang) => {
    try {
      const parser = new xml2js.Parser();
      const builder = new xml2js.Builder();
      const parsedXml = await parser.parseStringPromise(xml);

      // Escape every <text> entry, not just the first one, and cope with
      // entries that were parsed into objects rather than plain strings.
      forEachWorkitem(parsedXml, (workitem) => {
        if (Array.isArray(workitem.text)) {
          workitem.text = workitem.text.map(escapeTextNode);
        }
      });

      // Media titles are attributes, so they are translated separately.
      const updatedXmlWithAttributes = await DeepLService.translateAttributes(parsedXml, targetLang);
      return builder.buildObject(updatedXmlWithAttributes);
    } catch (err) {
      throw wrapWithCause(`Parsing error: ${err}`, err);
    }
  },

  /**
   * Translates a workflow XML document.
   *
   * When no client is configured the input string is returned unchanged;
   * otherwise an object is returned. Callers have to handle both shapes.
   *
   * @param {string} xml - Source XML document.
   * @param {string} targetLang - Target language code.
   * @returns {Promise<string|{translatedXML: string, sourceLang: string}>}
   *   The untouched input, or the translated XML plus the detected source language.
   * @throws {Error} "Translation failed" with the original error as `.cause`.
   */
  translate: async (xml, targetLang) => {
    try {
      if (!translator) {
        return xml;
      }
      const updatedXml = await DeepLService.parseXML(xml, targetLang);
      const result = await translator.translateText(
        updatedXml,
        null,
        targetLang,
        {
          tagHandling: 'xml',
          splitSentences: 'nonewlines',
          ignoreTags: ['fe', 'type', 'symbol', 'media', 'trainingtype', 'WorldMap', 'worldmap', 'assetbundle', 'creationTime']
        }
      );

      // Restore the markdown characters that parseXML replaced.
      let translatedXml = result.text;
      if (translatedXml) {
        translatedXml = DeepLService.unescapeMarkdown(translatedXml);
      }
      return {
        translatedXML: translatedXml,
        sourceLang: result.detectedSourceLang
      };
    } catch (error) {
      console.error('DeepL API error:', error.message);
      throw wrapWithCause('Translation failed', error);
    }
  },

  /**
   * Lists the target languages offered by the client.
   *
   * @returns {Array|Promise} An empty array when no client is configured,
   *   otherwise whatever `translator.getTargetLanguages()` returns.
   */
  getSupportedTargetLanguages: () => {
    if (!translator) {
      return [];
    }
    return translator.getTargetLanguages();
  }
};
module.exports = DeepLService;
如您所见,我正在尝试翻译
<text>
和 <title>
标签内的所有内容,有时这些标签可以包含 markdown (尤其是 <text>
标签)。
我在这里遇到的问题是,某些语言在翻译后似乎会生成错误的 Markdown。例如,如果原始文档中有这样的内容:
**bold text**
,在法语翻译上它看起来像这样:***bold text__**
(请忽略文本本身没有被翻译这一点,我现在只关注 Markdown)。
有时它还会去掉无序列表 Markdown 中的第一个星号:原文:
* One * Two * Three
翻译:One * Two * Three
这些错误并不是在所有语言中都会发生,而且非常不一致。 所以,我的问题是,在 DeepL 中进行 Markdown 翻译的最佳方法是什么,以便它在所有语言中保持一致?
如果我没看错的话,您将在文本等中发送
__ASTERISK__
而不是 *
——这会使模型感到困惑,并导致翻译效果变差。DeepL 本身并不支持 Markdown,但如果在文本中保留原始的 Markdown,我预计翻译质量会比使用转义后的文本更高。
此外,DeepL 支持 XML——您可以尝试使用忽略标签(ignoreTags)来转义 Markdown,使其不被翻译。