我知道我们可以使用像
pdf-parse
这样的库从 PDF 缓冲区中提取 numpages
。我遇到一个问题,有时我的缓冲区太大而无法保留在内存中。所以我决定使用流式传输:
// Download the PDF as a Node stream rather than buffering the whole body:
// responseType: 'stream' makes axios put a Readable in response.data.
const response = await axios.get(file.href, {
responseType: 'stream'
});
const stream = response.data;
// Consume the file chunk by chunk; `...` is the question's placeholder for
// the per-chunk handling, so the full PDF is never held in memory at once.
// NOTE(review): this snippet is illustrative — `await` here assumes an
// enclosing async function or ESM top-level await.
stream.on('data', data => {
...
});
是否有一个库可以帮助我解析缓冲区流以提取页面数量,就像
pdf-parse
对单个缓冲区所做的那样?
希望这对您有帮助,
const axios = require('axios');
const PDFParser = require('pdf-parse');
const { Readable } = require('stream');
/**
 * Download a PDF from a URL and extract its text with pdf-parse.
 *
 * Note on streaming: pdf-parse cannot parse incrementally — a PDF's
 * cross-reference table sits at the END of the file, so the whole document
 * must be available before parsing. We therefore stream the *download*
 * (avoiding an extra in-memory copy on the HTTP side), collect the chunks,
 * and hand pdf-parse one Buffer. The page count is available on the same
 * result as `pdfData.numpages`.
 *
 * @param {string} pdfUrl - URL of the PDF to download and parse.
 * @returns {Promise<string>} The extracted text of the document.
 * @throws On network failure or if the payload is not a valid PDF.
 */
async function extractTextFromPDFStream(pdfUrl) {
  const response = await axios.get(pdfUrl, { responseType: 'stream' });

  // Node Readable streams are async-iterable; this replaces the original
  // custom Readable wrapper, whose read() called pdfStream.read(size)
  // synchronously, got null before any data had arrived, and pushed null —
  // ending the stream prematurely.
  const chunks = [];
  for await (const chunk of response.data) {
    chunks.push(chunk);
  }

  // pdf-parse exports a function that takes a Buffer and resolves with
  // { text, numpages, ... }. The original code treated it as an event
  // emitter with pdf2json's event names ('pdfParser_dataError' /
  // 'pdfParser_dataReady'), which pdf-parse never fires, and piped into it
  // although it is not a Writable — so the wrapped Promise hung forever.
  const pdfData = await PDFParser(Buffer.concat(chunks));

  return pdfData.text;
}
// Usage example: fetch a large PDF and print its extracted text.
const pdfUrl = 'https://example.com/large.pdf';
(async () => {
  try {
    const text = await extractTextFromPDFStream(pdfUrl);
    console.log(text);
  } catch (err) {
    console.error('Error:', err);
  }
})();