如何替换PDF中的文本?

问题描述 投票:0回答:1

对于我的项目,我需要用 JS 替换 PDF 中的文本。我已经取得了一些进展,但专业程序(如 Word 或 LibreOffice Writer)输出的 PDF 中没有任何可以直接读懂和替换的文本。我需要怎么做?

基本原始 PDF 文件(仅一部分): 4 0 obj << /Length 132 >> stream BT /F1 12 Tf 50 150 TD (这是一个简单的PDF)Tj 0 -20 Td /F2 12 Tf (这是粗体文字)Tj 0 -20 Td /F3 12 Tf (这是斜体文本)Tj ET endstream endobj

在上面我们可以很容易地看到文本,但是Word的PDF等没有真正的文本。我需要知道如何解析文件。任何文章或帮助都会有帮助,非常感谢您的阅读。

我尝试用 fs 读取文件后进行解析和替换:使用常量文本时可以正常工作,但当输入来自其他文件或用户时就无法工作。这是基本代码:

const fs = require('fs');
const path = require('path');

// Module-level state shared by the functions in this file.
// Raw file contents as a string; assigned by readPDFFile() and rewritten by replace().
var pdfData = null;
// Array of { text, font } entries built by parsePDF() and restyled by processParsedText().
let parsedText = null;
// Base font name -> the matched "/BaseFont /Name..." text; rebuilt on every parsePDF() call.
let fontMap = {};


/**
 * Loads a file synchronously as UTF-8 text and stores it in the module-level
 * `pdfData` variable.
 *
 * NOTE(review): the file is decoded as 'utf8'; confirm that inputs never contain
 * bytes that are not valid UTF-8 (e.g. compressed streams), since such bytes
 * cannot survive a text decode unchanged.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string|null} The file contents, or null when reading failed
 *   (the error is logged and `pdfData` keeps its previous value).
 */
function readPDFFile(filePath) {
    let contents;

    try {
        contents = fs.readFileSync(filePath, 'utf8');
    } catch (readError) {
        console.error(`Could not read the PDF file: ${readError.message}`);
        return null;
    }

    pdfData = contents;
    return contents;
}

/**
 * Scans the loaded PDF text line by line and collects every literal string that
 * is shown with the `Tj` operator inside a `stream` ... `endstream` section.
 *
 * Side effects: resets the module-level `fontMap` and stores the result in the
 * module-level `parsedText`.
 *
 * Each entry is tagged with the most recent `/BaseFont` name seen earlier in the
 * file (not with the font selected by a `Tf` operator).
 *
 * @returns {Array<{text: string, font: (string|null)}>|null} The collected
 *   entries in file order, or null when no PDF data has been loaded.
 */
function parsePDF() {
    if (!pdfData) {
        console.error('No PDF data available.');
        return null;
    }

    // Lazy, escape-aware match of "(...) Tj": `\\.` swallows a backslash together
    // with the character it escapes, so an escaped "\)" never ends the string, and
    // the lazy quantifier stops at the first real ") Tj" so that several Tj
    // operators on the same line produce separate entries.
    const showTextPattern = /\(((?:\\.|[^\\])*?)\) Tj/g;
    const baseFontPattern = /\/BaseFont\s\/(\w+)(?:-Bold|-Oblique)?/;

    const text = [];
    fontMap = {};

    let inStream = false;
    let currentFont = null;

    pdfData.split('\n').forEach(rawLine => {
        // trim() also removes the '\r' left behind by CRLF line endings.
        const line = rawLine.trim();

        if (line === 'stream') {
            inStream = true;
            return;
        }
        if (line === 'endstream') {
            inStream = false;
            return;
        }
        if (line.startsWith('/BaseFont')) {
            const fontMatch = baseFontPattern.exec(line);
            if (fontMatch) {
                // The captured name excludes a trailing -Bold / -Oblique suffix;
                // the map keeps the full matched text including it.
                currentFont = fontMatch[1];
                fontMap[currentFont] = fontMatch[0];
            }
            return;
        }
        if (!inStream) {
            return;
        }

        // The pattern is global and reused across lines, so restart it each time.
        showTextPattern.lastIndex = 0;
        let textMatch = showTextPattern.exec(line);
        while (textMatch !== null) {
            text.push({ text: textMatch[1], font: currentFont });
            textMatch = showTextPattern.exec(line);
        }
    });

    parsedText = text;
    return parsedText;
}

/**
 * Appends a style suffix to the font name of every parsed entry that has a font.
 *
 * The entries in the module-level `parsedText` are updated in place and
 * `parsedText` is reassigned to a new array holding the same entry objects.
 * Calling this repeatedly appends the suffix repeatedly.
 *
 * @param {string} style - Either 'Italic' (suffix '-Oblique') or 'Bold' (suffix '-Bold').
 * @returns {Array<{text: string, font: (string|null)}>|null} The updated entries,
 *   or null when nothing has been parsed yet or the style is not supported.
 */
function processParsedText(style) {
    if (!parsedText) {
        console.error('No parsed text available.');
        return null;
    }

    const styleMap = {
        Italic: '-Oblique',
        Bold: '-Bold'
    };

    // Own-property check: a plain `styleMap[style]` lookup would also accept
    // inherited keys such as 'toString' or 'constructor'.
    if (!Object.prototype.hasOwnProperty.call(styleMap, style)) {
        console.error('Invalid style provided.');
        return null;
    }

    const suffix = styleMap[style];

    parsedText = parsedText.map(item => {
        if (item.font) {
            item.font = `${item.font}${suffix}`;
        }
        return item;
    });

    return parsedText;
}

/**
 * Escapes every character that has a special meaning in a regular expression
 * so the input can be matched literally.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegExp(value) {
    return String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Replaces the `(old text) Tj` operators in the loaded PDF text with new strings
 * and stores the result back in the module-level `pdfData`.
 *
 * All replacements happen in one pass over the document, so a string that has
 * just been inserted is never rewritten again by a later entry. When the same
 * old text appears in several entries, the first entry's new text is used for
 * every occurrence.
 *
 * The new strings are inserted verbatim: parentheses or backslashes in them are
 * not escaped here.
 *
 * @param {Array<{text: string, font: (string|null)}>} parsedText - Entries as
 *   returned by parsePDF().
 * @param {string[]} newText - Replacement strings, one per entry, in the same order.
 * @returns {Array<{text: string, font: (string|null)}>|null} The entries with
 *   their new text, or null when the arguments are invalid or no PDF is loaded.
 */
function replace(parsedText, newText) {
    if (!parsedText) {
        console.error('No parsed text provided.');
        return null;
    }

    if (!newText || newText.length !== parsedText.length) {
        console.error('Invalid new text provided.');
        return null;
    }

    if (typeof pdfData !== 'string') {
        console.error('No PDF data available.');
        return null;
    }

    const newContent = parsedText.map((item, index) => ({
        text: newText[index],
        font: item.font
    }));

    // Old text -> new text; the first entry wins for duplicated old texts.
    const replacements = new Map();
    parsedText.forEach((item, index) => {
        const oldText = String(item.text);
        if (!replacements.has(oldText)) {
            replacements.set(oldText, newContent[index].text);
        }
    });

    if (replacements.size > 0) {
        const alternatives = [...replacements.keys()].map(escapeRegExp).join('|');
        const pattern = new RegExp(`\\((${alternatives})\\) Tj`, 'g');

        // A replacer function keeps '$&', '$1', ... in the new text literal.
        pdfData = pdfData.replace(
            pattern,
            (whole, oldText) => `(${replacements.get(oldText)}) Tj`
        );
    }

    return newContent;
}

/**
 * Writes the current module-level `pdfData` to disk as UTF-8 text.
 *
 * Nothing is written unless both `pdfData` and `parsedText` are set; in that
 * case an error is logged instead.
 *
 * @param {string} filename - Destination path.
 * @returns {undefined}
 */
function saveToFile(filename) {
    const hasData = Boolean(pdfData) && Boolean(parsedText);

    if (!hasData) {
        console.error('No data available.');
        return;
    }

    fs.writeFileSync(filename, String(pdfData), 'utf8');
}

/**
 * Convenience pipeline: read the input file, parse it, and write the current
 * PDF text to the output path.
 *
 * The pipeline stops when the input cannot be read. Otherwise the module-level
 * state left over from an earlier call would be parsed and written to
 * `outputFilePath` as if it came from `inputFilePath`.
 *
 * @param {string} inputFilePath - Path of the PDF to read.
 * @param {string} outputFilePath - Path the result is written to.
 * @returns {undefined}
 */
function processPDF(inputFilePath, outputFilePath) {
    // readPDFFile() reports failure by returning null (it has already logged the error).
    if (readPDFFile(inputFilePath) === null) {
        return;
    }

    parsePDF();
    saveToFile(outputFilePath);
}

module.exports = {
    readPDFFile,
    parsePDF,
    processParsedText,
    replace,
    saveToFile,
    processPDF,
    getPdfData: () => pdfData,
    getParsedText: () => parsedText
};
javascript pdf pdf-generation
1个回答
0
投票

要使用 JavaScript 将 PDF 解压缩为纯文本(例如 ANSI 或 JSON),可以考虑使用 Coherent 的 cpdf JS API。

我使用命令行方法来完成此类任务,因为一次性编辑更容易。但对于可编程方法,您需要 API 库。

作为示例,编码格式的 text.pdf 文件(我很清楚)将如下所示。

2 0 obj
<</Filter/FlateDecode/Length 361>>
stream
xM‘Ñn‚0†ïû'Ù…xQjQ”-ñÂeš]˜ÌL^ Ð"Ý€’Ò§ bB_Ïéþóóž!Y, •°Â÷ž@ZàT   ݤêê`¥síc…Q¶3u˜›š5ʱOÓ§†Iáû2'YŒ­÷WQ·•
[YÌgÄË­Qî¦ð RΡN8ŸAäOIðê?7I¦:¡xêƒïˆÇŽ@ ðÍ3$O-' tÝ(ÃRœL@èò®—#=ô$B´ï5‹×èãYž/ÆhÅEn@4Æ•ÊÂéã0mµ«:ìð)‚›bì0GÌÆd?*wC’H¬B£Ë¶õÁ½xÝm4îœ}ˆÜÿŽÊ|589ë‹+á¬ò?«À•ºƒBcðG‰„7½v%ì¤É|+!•_€Æ@Gc!¯tþû0Ö÷}(|÷`LäÖdÂ1ÓTºQ¬Ã‘Ý…¥«+ÔÙ§$BCëMÅÀÇ*r&ÿ#ɤÅ
endstream
endobj

但作为可编辑文本(

cpdf -decompress -no-preserve-objstm out.pdf -o text.pdf
),它看起来如下所示。

10 0 obj
<</Length 588>>
stream
BT 15 800 Td 40 TL/1 18 Tf(PlainText = http://foersom.com/net/HowTo/data/OoPdfFormExample.pdf)'/1 16 Tf 40 TL(12  letters.)' 20 TL(9)' 27 20 Td(letters.)' -27 0 Td(A)' 25 20 Td(a)' 17 20 Td(a)' 18 20 Td(a)' 23 20 Td(a)' -72 20 Td(b)' 58 20 Td(b)' -35 20 Td(c)' 17 20 Td(d)' 26.5 20 Td(r)' -58 20 Td(r)' 40 TL(goto page in another PDF)' 20 TL(Also plain text = https://pdfobject.com/pdf/sample-3pp.pdf#page=2)' 25 TL/1 11 Tf 120 140 Td(Sign and Secure this file.txt as a PDF with Adobe Reader)' -5 -40 Td(or click https://www.adobe.com/acrobat/online/sign-pdf.html)' ET 220 680 150 50 re S
endstream
endobj

或 JSON (

cpdf in.pdf -output-json -utf8 -output-json-parse-content-streams -o out.json
)。

{
      "S": [
        {},
        [
          [ "BT" ],
          [ { "F": 15.0 }, { "F": 800.0 }, "Td" ],
          [ { "F": 40.0 }, "TL" ],
          [ "/1", { "F": 18.0 }, "Tf" ],
          [
            "PlainText = http://foersom.com/net/HowTo/data/OoPdfFormExample.pdf",
            "'"
          ],
          [ "/1", { "F": 16.0 }, "Tf" ],
          [ { "F": 40.0 }, "TL" ],
          [ "12  letters.", "'" ],
          [ { "F": 20.0 }, "TL" ],
          [ "9", "'" ],
          [ { "F": 27.0 }, { "F": 20.0 }, "Td" ],
          [ "letters.", "'" ],
          [ { "F": -27.0 }, { "F": 0.0 }, "Td" ],
          [ "A", "'" ],
          [ { "F": 25.0 }, { "F": 20.0 }, "Td" ],
          [ "a", "'" ],
          [ { "F": 17.0 }, { "F": 20.0 }, "Td" ],
          [ "a", "'" ],
          [ { "F": 18.0 }, { "F": 20.0 }, "Td" ],
          [ "a", "'" ],
          [ { "F": 23.0 }, { "F": 20.0 }, "Td" ],
          [ "a", "'" ],
          [ { "F": -72.0 }, { "F": 20.0 }, "Td" ],
          [ "b", "'" ],
          [ { "F": 58.0 }, { "F": 20.0 }, "Td" ],
          [ "b", "'" ],
          [ { "F": -35.0 }, { "F": 20.0 }, "Td" ],
          [ "c", "'" ],
          [ { "F": 17.0 }, { "F": 20.0 }, "Td" ],
          [ "d", "'" ],
          [ { "F": 26.5 }, { "F": 20.0 }, "Td" ],
          [ "r", "'" ],
          [ { "F": -58.0 }, { "F": 20.0 }, "Td" ],
          [ "r", "'" ],
          [ { "F": 40.0 }, "TL" ],
          [ "goto page in another PDF", "'" ],
          [ { "F": 20.0 }, "TL" ],
          [
            "Also plain text = https://pdfobject.com/pdf/sample-3pp.pdf#page=2",
            "'"
          ],
          [ { "F": 25.0 }, "TL" ],
          [ "/1", { "F": 11.0 }, "Tf" ],
          [ { "F": 120.0 }, { "F": 140.0 }, "Td" ],
          [ "Sign and Secure this file.txt as a PDF with Adobe Reader", "'" ],
          [ { "F": -5.0 }, { "F": -40.0 }, "Td" ],
          [
            "or click https://www.adobe.com/acrobat/online/sign-pdf.html",
            "'"
          ],
          [ "ET" ],
          [
            { "F": 220.0 },
            { "F": 680.0 },
            { "F": 150.0 },
            { "F": 50.0 },
            "re"
          ],
          [ "S" ]
        ]
      ]

这样文本就更容易编辑(前提是小心操作),而且编辑后的
JSON (

cpdf -j in.json -o out.pdf
) 或 TXT (
cpdf pdf.txt -o text2.pdf
)
都可以重新转换回压缩的 .pdf 格式。

© www.soinside.com 2019 - 2024. All rights reserved.