如何从流中获取PDF的总页数?

问题描述 投票:0回答:1

我知道我们可以使用像

pdf-parse
这样的库从 PDF 缓冲区中提取
numpages
。我遇到的问题是:有时文件太大,缓冲区无法完整地保留在内存中。所以我决定改用流(stream)的方式来读取:

// Request the file with axios, asking for the response body as a stream
// (responseType: 'stream').
const response = await axios.get(file.href, {
    responseType: 'stream'
});

// The body requested above is exposed on response.data.
const stream = response.data;
 
// Handler invoked for each 'data' event from the stream; its body is elided
// in this snippet.
stream.on('data', data => { 
    ...
});

是否有一个库可以帮助我解析缓冲区流以提取页面数量,就像

pdf-parse
对单个缓冲区所做的那样?

javascript node.js stream
1个回答
0
投票

希望这对您有帮助,

const axios = require('axios');
const PDFParser = require('pdf-parse');
const { Readable } = require('stream');

/**
 * Download a PDF over HTTP and return the text extracted from it.
 *
 * pdf-parse is called as a function on a complete Buffer and returns a
 * promise; it is not a writable stream or an event emitter, so it cannot be
 * the target of `.pipe()`. The response is therefore streamed from the
 * network and collected into a single Buffer before parsing.
 *
 * NOTE: this still holds the whole PDF in memory at parse time. The same
 * parse result also carries `numpages` if the page count is what is needed.
 *
 * @param {string} pdfUrl - URL of the PDF file to download.
 * @returns {Promise<string>} The text extracted by pdf-parse.
 * @throws Rejects if the request fails, the response stream errors, or
 *   pdf-parse cannot parse the downloaded data.
 */
async function extractTextFromPDFStream(pdfUrl) {
  // Ask axios for a stream so the body is delivered chunk by chunk.
  const response = await axios.get(pdfUrl, { responseType: 'stream' });
  const pdfStream = response.data;

  // Async iteration ends only when the stream really ends and rejects if the
  // stream emits 'error', unlike treating a null read() as end-of-stream.
  const chunks = [];
  for await (const chunk of pdfStream) {
    chunks.push(chunk);
  }
  const pdfBuffer = Buffer.concat(chunks);

  // pdf-parse resolves with an object that includes `text` and `numpages`.
  const pdfData = await PDFParser(pdfBuffer);

  return pdfData.text;
}

// Usage: fetch the sample PDF and print its extracted text, or report the
// failure on stderr.
const pdfUrl = 'https://example.com/large.pdf';
(async () => {
  try {
    const text = await extractTextFromPDFStream(pdfUrl);
    console.log(text);
  } catch (err) {
    console.error('Error:', err);
  }
})();
© www.soinside.com 2019 - 2024. All rights reserved.