/*
 * File management
 */

import {
  FilePageType,
  GrapheneInputFile,
  Mutation,
  OriginalInputPdf,
  OriginalPdf,
  PageAsyncBatchNode,
  PageAsyncBatchType,
  PageAsyncTaskStatus,
  Query,
} from '@src/graphql/types'
import { groupBy, zip as lodashZip } from 'lodash'
import { GET_PAGE_ASYNC_BATCH } from '@src/graphql/queries/file'
import { TIFF_TO_PNG } from '@src/graphql/mutations/file'
import { ApolloClient } from '@apollo/client'
import { from, Observable, zip } from 'rxjs'
import { mergeMap, scan } from 'rxjs/operators'
import { ImageFileData } from './types'
import { tracingFetch } from '@src/utils/observability/tracing'
import { DeployEnvironment, getDeployEnvironment } from './environment'
// pdfjs-dist breaks when it goes through our build process, so it is loaded from a CDN
// instead of being bundled; this module only reads it off `window.pdfjsLib`.
// When the library is present, point its worker at the matching CDN worker script.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const pdfjsGlobal = (window as any).pdfjsLib
if (pdfjsGlobal) {
  pdfjsGlobal.GlobalWorkerOptions.workerSrc =
    'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.14.305/build/pdf.worker.min.js'
}

/**
 * Reads a browser `File` through `FileReader.readAsDataURL`.
 *
 * @param inputFile file to read
 * @returns promise resolving with the reader's result (a data URL string); it rejects
 *   with the event passed to the reader's `onerror` handler
 */
export const toBase64 = (inputFile: File): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    const fileReader = new FileReader()
    // handlers fire asynchronously, so registering them before starting the read is equivalent
    fileReader.onerror = (event) => reject(event)
    fileReader.onload = () => resolve(fileReader.result as string)
    fileReader.readAsDataURL(inputFile)
  })

/**
 * Decodes a bare base64 string (no `data:...;base64,` prefix) into raw bytes.
 *
 * @param base64 base64-encoded payload
 * @returns an ArrayBuffer holding one byte per decoded character
 */
export const base64ToArrayBuffer = (base64: string): ArrayBuffer => {
  // atob yields a "binary string": every character code is a single byte value
  const decoded = atob(base64)
  return Uint8Array.from(decoded, (char) => char.charCodeAt(0)).buffer
}

/** Per-page images extracted from a document, as returned by pdfToImage and tiffToImage. */
type MultipageImageData = {
  // one encoded image per page; pdfToImage fills this with JPEG data URLs
  images: string[]
  // number of pages in the source document; consumers iterate pages 0..numPages-1
  numPages: number
  // pixel dimensions per page, indexed by page alongside `images`
  sizes: { width: number; height: number }[]
}

/**
 * Converts a TIFF into per-page images by sending it through the `TIFF_TO_PNG` mutation.
 *
 * @param client Apollo client used to run the mutation
 * @param base64TIFF TIFF contents, passed to the mutation as `imageBase64Data`
 * @returns the mutation's `tiffToPng` payload, cast to MultipageImageData
 */
async function tiffToImage(
  client: ApolloClient<unknown>,
  base64TIFF: string,
): Promise<MultipageImageData> {
  // TODO: this may error out if we have a massive tiff file with code 413. We should monitor it.
  const mutationResult = await client.mutate<Pick<Mutation, 'tiffToPng'>>({
    mutation: TIFF_TO_PNG,
    variables: { imageBase64Data: base64TIFF },
  })
  // the mutation result is assumed to carry data; a missing payload surfaces as a TypeError here
  const converted = mutationResult.data!.tiffToPng
  return converted as unknown as MultipageImageData
}

/**
 * Renders every page of a PDF into a JPEG data URL using the globally loaded pdf.js.
 *
 * @param base64PDF PDF contents as base64, optionally prefixed with
 *   `data:application/pdf;base64,`
 * @returns the rendered images, the document's page count and each rendered page's size
 */
async function pdfToImage(base64PDF: string): Promise<MultipageImageData> {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const pdfjsLib = (window as any).pdfjsLib
  const renderTasks: Promise<void>[] = []
  const pageCanvases: HTMLCanvasElement[] = []
  const sizes: { width: number; height: number }[] = []

  const rawBase64 = base64PDF.replace(/^data:application\/pdf;base64,/, '')
  const loadingTask = pdfjsLib.getDocument({
    data: new Uint8Array(base64ToArrayBuffer(rawBase64)),
    cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.14.305/cmaps/',
    cMapPacked: true,
    // Workaround for pdf.js remote-code execution vulnerability
    //  See https://expedock.atlassian.net/browse/PD-8973
    isEvalSupported: false,
  })
  const pdf = await loadingTask.promise
  const { numPages } = pdf

  // upscale (never downscale) so the longest side of each page is at least 4K wide
  const targetLongestSide = 3840
  // pdf.js page numbers are 1-based
  for (let pageNumber = 1; pageNumber <= numPages; pageNumber += 1) {
    // eslint-disable-next-line no-await-in-loop
    const page = await pdf.getPage(pageNumber)
    const unscaledViewport = page.getViewport({ scale: 1.0 })
    const longestSide = Math.max(unscaledViewport.height, unscaledViewport.width)
    const scale = Math.max(1.0, targetLongestSide / longestSide)
    const viewport = page.getViewport({ scale })
    const { width, height } = viewport

    const canvas = document.createElement('canvas')
    const canvasContext = canvas.getContext('2d')
    // pages for which no 2d context could be obtained are skipped
    if (canvasContext != null) {
      canvas.height = height
      canvas.width = width
      // start rendering now; all renders are awaited together after the loop
      renderTasks.push(page.render({ canvasContext, viewport }).promise)
      pageCanvases.push(canvas)
      sizes.push({ width, height })
    }
  }
  await Promise.all(renderTasks)

  // TODO: use PNG instead of JPG, or use a heuristic... or just upload the raw PDF and store the
  //       selected page numbers
  const images = pageCanvases.map((rendered) => rendered.toDataURL('image/jpeg'))
  return { images, numPages, sizes }
}

/**
 * Requests signed URLs for the given files from the cauldron OCR API.
 *
 * @param fileData files to request URLs for; only their `filename` is sent
 * @param accessToken bearer token placed in the Authorization header
 * @returns the raw response of the POST request
 */
async function getSignedFileURLs(
  fileData: ImageFileData[],
  accessToken: string,
): ReturnType<typeof fetch> {
  const filenames = fileData.map((entry) => entry.filename)
  // every entry is requested as a single page
  const pages = fileData.map(() => 1)
  const response = await tracingFetch('/cauldron-api/ocr/file/generate-signed-urls', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${accessToken}` },
    body: JSON.stringify({ filenames, pages }),
  })
  return response
}

/**
 * Uploads one file to its signed URL with an HTTP PUT, retrying on network failure.
 *
 * Only rejected `fetch` calls are retried. A response with an HTTP error status still
 * resolves and is returned to the caller as-is.
 *
 * @param fileBytes file contents as a data URL (`<mime>;base64,<payload>`)
 * @param fileType page type; Excel files have a stricter size limit
 * @param signedUrl destination URL for the PUT request
 * @returns the response of the first attempt that did not reject
 * @throws Error when the decoded file exceeds its size limit (the message is user-facing)
 * @throws the last fetch error once all attempts have failed
 */
const uploadFileDataToBucket = async (
  fileBytes: string,
  fileType: FilePageType,
  signedUrl: string,
): Promise<Response> => {
  const mimeTypePattern = /(image\/(png|jpeg|jpg|tiff|tif))|(application\/[^;]*)/
  // NOTE: contentType is currently only consumed by the disabled header in the PUT below
  let contentType = (mimeTypePattern.exec(fileBytes) || [''])[0]
  const imageBytes = base64ToArrayBuffer(fileBytes.replace(/^(.*);base64,/, ''))

  // https://expedock.atlassian.net/browse/PD-811
  // restrict non-pdf and non-image file uploads to less than 5mb
  const excelFileSizeLimit = 5000000
  // PD-1374: 100 MB for others
  const nonExcelFileSizeLimit = 100000000
  if (contentType === 'image/jpg') {
    contentType = 'image/jpeg'
  }

  if (!contentType) {
    contentType = 'application/octet-stream'
  }

  if (fileType === FilePageType.Excel && imageBytes.byteLength >= excelFileSizeLimit) {
    // throw real Error instances (still exposing `.message`) so stack traces are preserved
    throw new Error('Unable to upload Excel files over 5 MB. Reduce file size and try again.')
  }
  if (fileType !== FilePageType.Excel && imageBytes.byteLength >= nonExcelFileSizeLimit) {
    throw new Error('Unable to upload files over 100 MB. Reduce file size and try again.')
  }

  const maxAttempts = 3
  const secondsBetweenTries = 1
  let lastError: unknown
  for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
    try {
      // eslint-disable-next-line no-await-in-loop
      return await fetch(signedUrl, {
        method: 'PUT',
        // headers: { 'Content-Type': contentType },
        body: imageBytes,
      })
    } catch (e) {
      lastError = e
      // back off before every retry, but not after the final attempt; derived from
      // maxAttempts so the two cannot drift apart
      if (attempt < maxAttempts - 1) {
        // eslint-disable-next-line no-await-in-loop
        await new Promise<void>((resolve) => setTimeout(resolve, secondsBetweenTries * 1000))
      }
    }
  }
  throw lastError
}

/**
 * Requests signed URLs for the given files and prepares their upload.
 *
 * @param fileData files to upload
 * @param accessToken bearer token used when requesting the signed URLs
 * @returns `progress$`, which performs the uploads (at most 5 at a time) once subscribed
 *   and emits the running count of finished uploads, and `viewUrls`, the view URLs
 *   returned by the signing endpoint
 */
export const uploadFileData = async (
  fileData: ImageFileData[],
  accessToken: string,
): Promise<{ progress$: Observable<number>; viewUrls: string[] }> => {
  // weirdly, our upload breaks if the file has a caps suffix (like image.PNG vs image.png)
  // so we just lowercase it and it works fine.
  const cleanedFileData = fileData.map((data) => {
    const { filename } = data
    // Only the part after the LAST dot is the extension. Slicing the string (instead of
    // splitting into segments and interpolating the array) keeps other dots in the name
    // intact and leaves extension-less names untouched.
    const extensionStart = filename.lastIndexOf('.')
    if (extensionStart === -1) {
      return { ...data }
    }
    const baseName = filename.slice(0, extensionStart)
    const extension = filename.slice(extensionStart).toLowerCase()
    return { ...data, filename: `${baseName}${extension}` }
  })
  const signedURLs = await (await getSignedFileURLs(cleanedFileData, accessToken)).json()
  const putUrls: string[] = signedURLs.signed_urls
  const viewUrls: string[] = signedURLs.view_urls

  // just a guess -- splitting 5-ways should be a good balance between
  // roundtrip latency and upload bandwidth contention
  const concurrencyLimit = 5
  // upload all the data, with at most `concurrencyLimit` uploads at a time
  return {
    progress$: zip(from(cleanedFileData), from(putUrls)).pipe(
      mergeMap(
        ([{ file, type }, putUrl]) => uploadFileDataToBucket(file, type, putUrl),
        concurrencyLimit,
      ),
      // turn each finished upload into a running total
      scan((acc) => acc + 1, 0),
    ),
    viewUrls,
  }
}

/**
 * Loads an image source into an off-DOM `Image` to measure it.
 *
 * @param file value assigned to the image's `src`
 * @returns promise resolving with the loaded image's width and height; it rejects with
 *   whatever the image's `onerror` handler receives
 */
function getImageDimensions(file: string): Promise<{ width: number; height: number }> {
  return new Promise((resolve, reject) => {
    const probe = new Image()
    probe.onerror = (err) => reject(err)
    probe.onload = () => {
      const { width, height } = probe
      resolve({ width, height })
    }
    // assigning src last starts the load with both handlers already in place
    probe.src = file
  })
}

/**
 * Builds the per-page filename for one page of a multi-page document.
 *
 * @param filename original filename
 * @param pageNumber zero-based page index; the generated name is one-based
 * @param totalPages number of pages in the document; single-page documents keep
 *   `filename` unchanged
 * @param suffix extension for the generated name, with or without a leading dot
 * @returns `<name>-page <n>.<suffix>`, where `<name>` is `filename` without its extension
 */
function formatMultiplePageFilename(
  filename: string,
  pageNumber: number,
  totalPages: number,
  suffix: string,
): string {
  if (totalPages === 1) {
    return filename
  }

  // strip only the final extension so dots inside the name ("my.report.pdf") survive
  const extensionStart = filename.lastIndexOf('.')
  const baseName = extensionStart > 0 ? filename.slice(0, extensionStart) : filename
  // tolerate a suffix passed as ".pdf" so the result never contains a double dot
  const extension = suffix.startsWith('.') ? suffix.slice(1) : suffix
  return `${baseName}-page ${pageNumber + 1}.${extension}`
}

/**
 * Recovers the source document's filename from a per-page filename of the form
 * `<name>-page <n>.<ext>`.
 *
 * Only `.pdf`, `.tiff` and `.tif` names are considered, since only those formats have
 * multiple pages. Any filename that does not end in a `-page <n>` marker followed by one
 * of those extensions is returned unchanged, so ordinary names that merely contain the
 * word "page" (e.g. `homepage.pdf`) are not mangled.
 *
 * @param filename per-page or plain filename
 * @returns the filename with the page marker removed, or `filename` itself
 */
function revertFormattedFilename(filename: string): string {
  // `\.+` also accepts names generated with a doubled dot before the extension
  const pageMarkerPattern = /^(.*)-page \d+\.+(pdf|tiff|tif)$/
  const match = pageMarkerPattern.exec(filename)
  if (match == null) {
    return filename
  }
  const [, baseName, extension] = match
  return `${baseName}.${extension}`
}

/**
 * Appends one ImageFileData entry per page of a converted document to `imagesData`.
 *
 * @param imageData per-page images and sizes of the document
 * @param filename document filename used to derive each page's filename
 * @param imagesData accumulator that is mutated in place
 * @param suffix extension handed to formatMultiplePageFilename
 */
const processMultiPageImage = (
  imageData: MultipageImageData,
  filename: string,
  imagesData: ImageFileData[],
  suffix: string,
): void => {
  const pageEntries = Array.from({ length: imageData.numPages }, (_, pageIndex) => ({
    file: imageData.images[pageIndex],
    size: imageData.sizes[pageIndex],
    filename: formatMultiplePageFilename(filename, pageIndex, imageData.numPages, suffix),
    pageNumber: pageIndex,
    // every converted page is tagged as an image/PDF page
    type: FilePageType.ImagePdf,
  }))
  imagesData.push(...pageEntries)
}

/**
 * Converts user-selected files into ImageFileData entries ready for upload.
 *
 * Files are processed one after another, in input order:
 * - PDFs are rendered in the browser, one entry per page;
 * - TIFFs are converted through the `tiffToPng` mutation, one entry per page;
 * - other images become a single entry with their measured dimensions;
 * - Excel and all remaining files become a single entry without dimensions.
 *
 * @param client Apollo client used for the TIFF conversion
 * @param files files to convert
 * @returns the entries for all files, in order
 */
export async function getImageFileData(
  client: ApolloClient<unknown>,
  files: File[],
): Promise<ImageFileData[]> {
  const imagesData: ImageFileData[] = []
  const excelMimeTypes = [
    'application/vnd.ms-excel',
    'application/vnd.ms-excel.sheet.macroEnabled.12',
    'application/msexcel',
    'application/x-msexcel',
    'application/x-ms-excel',
    'application/x-excel',
    'application/x-dos_ms_excel',
    'application/xls',
    'application/x-xls',
    'application/x-msi',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  ]
  for (let index = 0; index < files.length; index += 1) {
    const file = files[index]
    const filename = file.name
    const mimetype = file.type
    /* eslint-disable-next-line no-await-in-loop */
    const fileData = await toBase64(file)
    if (mimetype === 'application/pdf') {
      /* eslint-disable-next-line no-await-in-loop */
      const imageData = await pdfToImage(fileData)
      // the suffix is passed without a leading dot: formatMultiplePageFilename inserts
      // the "." separator itself
      processMultiPageImage(imageData, filename, imagesData, 'pdf')
    } else if (mimetype === 'image/tiff') {
      // Modern browsers don't play nice with TIF files, so
      // it's easier to just convert and move on with a supported image format
      // Swap only a trailing .tif/.tiff extension, in any letter case; the dot is escaped
      // and the match anchored so text inside the name (e.g. "stiff") is left alone.
      const convertedFilename = filename.replace(/\.tiff?$/i, '.png')
      /* eslint-disable-next-line no-await-in-loop */
      const imageData = await tiffToImage(client, fileData)
      processMultiPageImage(imageData, convertedFilename, imagesData, 'png')
    } else if (mimetype.startsWith('image/')) {
      /* eslint-disable-next-line no-await-in-loop */
      const size = await getImageDimensions(fileData)
      imagesData.push({
        size,
        filename,
        file: fileData,
        pageNumber: 0,
        type: FilePageType.ImagePdf,
      })
    } else if (excelMimeTypes.includes(mimetype)) {
      imagesData.push({
        filename,
        file: fileData,
        type: FilePageType.Excel,
        pageNumber: 0,
      })
    } else {
      imagesData.push({
        filename,
        file: fileData,
        type: FilePageType.Other,
        pageNumber: 0,
      })
    }
  }
  return imagesData
}

/**
 * Groups per-page file entries back into one GrapheneInputFile per source document.
 *
 * Entries, view URLs and document type IDs are matched up by position, then grouped by
 * the filename that revertFormattedFilename derives from each entry.
 *
 * @param filesData per-page file entries
 * @param viewUrls signed view URL for each entry, by position
 * @param documentTypeIds optional document type ID for each entry, by position
 * @param uploadedOriginalFiles optional already-uploaded originals; a case-insensitive
 *   filename match supplies the group's `originalFileUri`
 * @returns one input file per group, with its pages in input order
 */
export const groupFilesForUpload = (
  filesData: ImageFileData[],
  viewUrls: string[],
  documentTypeIds?: string[],
  uploadedOriginalFiles?: OriginalPdf[],
): GrapheneInputFile[] => {
  const typeIds = typeof documentTypeIds === 'undefined' ? [] : documentTypeIds
  const pageTuples = lodashZip(filesData, viewUrls, typeIds)
  const tuplesByDocument = groupBy(pageTuples, (tuple) =>
    revertFormattedFilename(tuple[0]!.filename),
  )

  return Object.entries(tuplesByDocument).map(([filename, tuples]) => {
    const lowercasedFilename = filename?.toLowerCase()
    const matchingOriginal = uploadedOriginalFiles?.find(
      (original) => original?.filename?.toLowerCase() === lowercasedFilename,
    )
    const pages = tuples.map(([pageFile, viewUrl, documentTypeId]) => {
      // entries without a known size are reported as 0x0
      const height = Math.floor(pageFile!.size?.height ?? 0)
      const width = Math.floor(pageFile!.size?.width ?? 0)
      return {
        signedViewUrl: viewUrl!,
        documentTypeId,
        size: { height, width },
        pageNumber: pageFile!.pageNumber,
      }
    })
    return {
      filename,
      pages,
      // the group's type is taken from its first page
      type: tuples[0][0]!.type,
      originalFileUri: matchingOriginal?.s3Uri ?? null,
    }
  })
}

/**
 * Builds a preview name of the form `<name>_page<n>`.
 *
 * @param fullFilename filename; everything from its first dot onwards is dropped
 * @param pageIndex zero-based page index; the name uses the one-based page number
 */
export const formatFilePreviewName = (fullFilename: string, pageIndex: number): string => {
  const firstDot = fullFilename.indexOf('.')
  const baseName = firstDot === -1 ? fullFilename : fullFilename.slice(0, firstDot)
  const oneBasedPage = pageIndex + 1
  return `${baseName}_page${oneBasedPage}`
}

/**
 * Tells whether a filename's extension denotes a PDF or a supported image format.
 *
 * The extension is the text after the last dot, compared case-insensitively. Both TIFF
 * spellings (`tif` and `tiff`) are accepted.
 *
 * @param filename filename to inspect
 * @returns true for pdf, jpg, jpeg, png, tif and tiff extensions
 */
export const isPdfOrImage = (filename: string): boolean => {
  const supportedExtensions = ['pdf', 'jpg', 'png', 'jpeg', 'tif', 'tiff']
  // with no dot present lastIndexOf is -1, so the whole name is treated as the extension
  const fileExtension = filename.slice(filename.lastIndexOf('.') + 1).toLowerCase()
  return supportedExtensions.includes(fileExtension)
}

/**
 * Replaces the extension of a filename.
 *
 * @param filename filename whose extension (the text after the last dot) is replaced
 * @param fileExtension new extension, without a leading dot
 * @returns the filename with the new extension; a filename without any dot gets the
 *   extension appended rather than having its whole name replaced
 */
export const replaceFileExtension = (filename: string, fileExtension: string): string => {
  const extensionStart = filename.lastIndexOf('.')
  const baseName = extensionStart === -1 ? filename : filename.slice(0, extensionStart)
  return `${baseName}.${fileExtension}`
}

/** Thrown by pollPageAsyncBatch when the polled batch reports an Error status. */
class AsyncTaskError extends Error {}

/**
 * Gives a pessimistic estimate of the seconds remaining until a batch completes.
 *
 * @param batch batch to estimate; its `numTasks`, `numDone` and `type` are read
 * @returns estimated seconds left, or null when the task count is unknown or the batch
 *   type is not one of Autofill, Ingest or IngestAutofill
 */
export const estimateBatchSecondsLeft = (batch: PageAsyncBatchNode): number | null => {
  const { numTasks } = batch
  if (numTasks == null) return null

  // latency, etc
  const startupSeconds = 1
  // expected time per OCR task
  const ocrSecondsPerTask = 8
  // expected time per autofill task
  const autofillSecondsPerTask = 3
  const workerCount = getDeployEnvironment() === DeployEnvironment.DEVELOPMENT ? 1 : 4

  const completedTasks = batch.numDone ?? 0
  const remainingTasks = numTasks - completedTasks
  // the fixed overhead only counts while nothing has finished yet
  const overheadSeconds = completedTasks === 0 ? startupSeconds : 0

  switch (batch.type) {
    case PageAsyncBatchType.Autofill:
      return overheadSeconds + remainingTasks * autofillSecondsPerTask
    // both ingest variants budget OCR plus autofill time per task, spread over the workers
    case PageAsyncBatchType.Ingest:
    case PageAsyncBatchType.IngestAutofill:
      return (
        overheadSeconds +
        (remainingTasks * (ocrSecondsPerTask + autofillSecondsPerTask)) / workerCount
      )
    default:
      return null
  }
}

/**
 * Polls until the given batch ID is done or errors out
 * Yields tuples of [% done, % buffer value, estimated seconds left]
 *
 * One tuple is yielded per successful poll, including the final poll that observes the
 * Done or Error status. Both fractions are 0 while `numDone` or `numTasks` is unknown.
 *
 * NOTE(review): if the polling budget runs out before the batch reaches Done or Error,
 * the generator simply finishes without throwing -- confirm callers can tell this apart
 * from a completed batch.
 *
 * @param client Apollo client used to query the batch (always with `no-cache`)
 * @param batchId ID of the batch to poll
 * @throws AsyncTaskError when the batch reports an Error status
 * @throws the query error once 3 polls in a row have failed
 */
export async function* pollPageAsyncBatch(
  client: ApolloClient<unknown>,
  batchId: string,
): AsyncGenerator<[number, number, number | null]> {
  // increasing polling time to 32 minutes to have enough time for 200 pages
  const pollingTimeMinutes = 32
  const pollingTimeMilliseconds = pollingTimeMinutes * 60 * 1000
  const pollingIntervalMilliseconds = 1000
  const maxConsecutiveErrors = 3
  let numConsecutiveErrors = 0
  let error: Error | null = null
  // `i` advances by one interval per iteration, so the budget is a number of polls
  // (failed ones included); time spent waiting on the query itself is not counted
  for (let i = 0; i < pollingTimeMilliseconds; i += pollingIntervalMilliseconds) {
    try {
      // eslint-disable-next-line no-await-in-loop
      const { data: pageAsyncBatchData } = await client.query<Pick<Query, 'pageAsyncBatch'>>({
        query: GET_PAGE_ASYNC_BATCH,
        fetchPolicy: 'no-cache',
        variables: {
          id: batchId,
        },
      })
      const batch = pageAsyncBatchData.pageAsyncBatch
      yield [
        // fraction of tasks that are done
        batch.numDone != null && batch.numTasks != null ? batch.numDone / batch.numTasks : 0,
        // buffer fraction: half a task ahead of the done count, capped at the task count
        batch.numDone != null && batch.numTasks != null
          ? Math.min(batch.numDone + 0.5, batch.numTasks) / batch.numTasks
          : 0,
        estimateBatchSecondsLeft(batch),
      ]
      if (batch.status === PageAsyncTaskStatus.Done) {
        break
      }
      if (batch.status === PageAsyncTaskStatus.Error) {
        // remember the failure and throw it after the loop, outside this try/catch
        error = new AsyncTaskError(batch.errorMessage ?? undefined)
        break
      }
      // we really do want to await in a loop to do the polling
      // eslint-disable-next-line no-await-in-loop
      await new Promise((r) => setTimeout(r, pollingIntervalMilliseconds))
      // a full successful poll clears the failure streak
      numConsecutiveErrors = 0
    } catch (e) {
      numConsecutiveErrors += 1
      if (numConsecutiveErrors >= maxConsecutiveErrors) {
        throw e
      }
      // wait one interval before retrying after a failed poll
      // eslint-disable-next-line no-await-in-loop
      await new Promise((r) => setTimeout(r, pollingIntervalMilliseconds))
    }
  }
  if (error) {
    throw error
  }
}

/**
 * Picks the PDFs out of a list of files and reads each one for upload.
 *
 * @param files candidate files; only those whose `type` is `application/pdf` are kept
 * @returns one OriginalInputPdf per PDF, in input order, holding the file's name and the
 *   string produced by toBase64
 */
export const getPDFsFromFilesForUpload = async (files: File[]): Promise<OriginalInputPdf[]> => {
  const onlyPdfs = files.filter(({ type }) => type === 'application/pdf')
  // all PDFs are read concurrently
  const pendingReads = onlyPdfs.map(
    async (pdf): Promise<OriginalInputPdf> => ({
      fileBytes: await toBase64(pdf),
      filename: pdf.name,
    }),
  )
  return Promise.all(pendingReads)
}
