import * as pdfJs from 'pdfjs-dist/webpack';
import mammoth from 'mammoth/mammoth.browser';

/**
 * Extracts plain text from a file, choosing the extractor by the file's MIME
 * type (`file.type`).
 *
 * Handled types are PDF (`application/pdf`), DOCX
 * (`application/vnd.openxmlformats-officedocument.wordprocessingml.document`)
 * and CSV (`text/csv`).
 *
 * @param file - The file whose text should be extracted.
 * @returns The text produced by the matching extractor.
 * @throws Error with message `Unsupported file type` when `file.type` is not
 *   one of the handled MIME types.
 */
export const extractTextFromFile = async (file: File): Promise<string> => {
  const mimeType = file.type;

  if (mimeType === 'application/pdf') {
    return await extractTextFromPDFFile(file);
  }

  if (
    mimeType ===
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  ) {
    return await extractTextFromDocxFile(file);
  }

  if (mimeType === 'text/csv') {
    return await extractTextFromCSVFile(file);
  }

  throw new Error('Unsupported file type');
};

/**
 * Extracts the text of every page of a PDF file.
 *
 * The file is handed to pdf.js through a temporary object URL. All pages are
 * loaded, and the `str` of each text item is concatenated with no separator
 * between items or between pages.
 *
 * @param file - The PDF file to read.
 * @returns The concatenated text of all pages.
 * @throws Rejects with whatever error pdf.js raises while loading the
 *   document, a page, or a page's text content.
 */
const extractTextFromPDFFile = async (file: File): Promise<string> => {
  const src = URL.createObjectURL(file);

  try {
    const pdf = await pdfJs.getDocument(src).promise;
    const pageList = await Promise.all(
      Array.from({ length: pdf.numPages }, (_, i) => pdf.getPage(i + 1))
    );

    const textList = await Promise.all(pageList.map((p) => p.getTextContent()));

    return textList
      .map(({ items }) => items.map(({ str }: { str: string }) => str).join(''))
      .join('');
  } finally {
    // Object URLs keep the underlying blob reachable until they are revoked
    // (or the document unloads). Revoke only after all page text has been
    // read, on both the success and the failure path, so nothing leaks.
    URL.revokeObjectURL(src);
  }
};

/**
 * Extracts the raw text of a DOCX file.
 *
 * Reads the file into an `ArrayBuffer` and passes it to mammoth's
 * `extractRawText`, returning the `value` field of the result.
 *
 * @param file - The DOCX file to read.
 * @returns The `value` of mammoth's raw-text extraction result.
 */
const extractTextFromDocxFile = async (file: File): Promise<string> => {
  const arrayBuffer = await file.arrayBuffer();
  const { value } = await mammoth.extractRawText({ arrayBuffer });
  return value;
};

/**
 * Reads a CSV file and returns its content as a string.
 *
 * Wraps the callback-based `FileReader.readAsText` (called without an explicit
 * encoding argument) in a promise.
 *
 * @param file - The CSV file to read.
 * @returns The reader's result once loading completes.
 * @throws Rejects with a `DOMException` (`Problem parsing input file.`) when
 *   the reader reports an error.
 */
const extractTextFromCSVFile = async (file: File): Promise<string> => {
  const csvReader = new FileReader();

  return new Promise<string>((fulfil, fail) => {
    const handleLoad = (): void => {
      // readAsText was used, so the result is treated as a string here.
      fulfil(csvReader.result as string);
    };

    const handleError = (): void => {
      csvReader.abort();
      fail(new DOMException('Problem parsing input file.'));
    };

    csvReader.onload = handleLoad;
    csvReader.onerror = handleError;

    csvReader.readAsText(file);
  });
};
