1import PDFJS from "pdfjs-dist";
2import PDFJSWorker from "pdfjs-dist/build/pdf.worker.js"; // add this to fit 2.3.0
3
// Module-level pdf.js flags, set once when this module is evaluated.
PDFJS.disableTextLayer = true;
PDFJS.disableWorker = true; // not available anymore since 2.3.0 (see imports)
6
/**
 * Extracts the text of a single page as one string.
 *
 * @param pdf - Loaded document; only its `getPage` method is used here.
 * @param pageNo - Page number, passed unchanged to `pdf.getPage`.
 * @returns The `str` of every text item on the page, concatenated with no separator.
 */
const getPageText = async (pdf: Pdf, pageNo: number) => {
  const { items } = await (await pdf.getPage(pageNo)).getTextContent();
  // Array#join renders a missing `str` as an empty string, so such items add nothing.
  return items.map(({ str }) => str).join("");
};
13
/**
 * Loads a PDF and returns the text of all of its pages.
 *
 * See the example of a PDFSource below.
 *
 * @param source - Passed unchanged to `PDFJS.getDocument`.
 * @returns Each page's text in page order, joined with single spaces.
 */
export const getPDFText = async (source: PDFSource): Promise<string> => {
  // Expose the imported worker module on `window` (added to fit 2.3.0).
  Object.assign(window, { pdfjsWorker: PDFJSWorker });

  const pdf: Pdf = await PDFJS.getDocument(source).promise;
  const pageCount = pdf.numPages;

  // Start every page extraction up front (page numbers begin at 1), then wait for all of them.
  const texts = await Promise.all(
    Array.from({ length: pageCount }, (_, index) => getPageText(pdf, index + 1)),
  );
  return texts.join(" ");
};
26