import { isProductionEnvironment } from '@luminovo/commons';
import { AzureExtractionResult } from '@luminovo/http-client';
import { extract } from './extract/extract';
import { extractCells } from './extract/extractCells';
import { extractKeyValuePairs } from './extract/extractKeyValuePairs';
import { extractLines } from './extract/extractLines';
import { extractParagraphs } from './extract/extractParagraphs';
import { extractTableKeyValuePairs } from './extract/extractTableKeyValuePairs';
import { mergeByConfidence } from './merge/merge';
import { Attribute, AttributeExtractionRule, Extractor, MergedAttributes, Region } from './types';

/**
 * Rule sets that drive a single `processPdf` run.
 */
export interface ExtractionOptions<TAttribute extends Attribute> {
    /** Rules handed to every extractor that `processPdf` configures. */
    extractionRules: Array<AttributeExtractionRule<TAttribute>>;
    /**
     * Region-level transformations. `processPdf` applies them in array order,
     * feeding the output of each rule into the next one.
     */
    mergeRules: Array<(regions: Array<Region<TAttribute>>) => Array<Region<TAttribute>>>;
    /**
     * Transformations over the merged attributes. `processPdf` applies them in
     * array order after the regions have been merged.
     */
    inferenceRules: Array<(input: MergedAttributes<TAttribute>) => MergedAttributes<TAttribute>>;
}

/**
 * The entry point for the pdf-extractor.
 *
 * The pipeline runs in three stages:
 *  1. Extraction - every extractor is configured with `extractionRules` and
 *     passed to `extract` together with the Azure result.
 *  2. Merging - each merge rule is applied to the regions in array order and
 *     the outcome is handed to `mergeByConfidence`.
 *  3. Inference - each inference rule is applied to the merged attributes in
 *     array order.
 *
 * Outside of production environments the elapsed time is printed to the console.
 *
 * @param azureExtractionResult - The Azure extraction result to process.
 * @param options - Extraction, merge and inference rules driving the pipeline.
 * @returns The merged attributes once every inference rule has been applied.
 */
export async function processPdf<TAttribute extends Attribute>(
    azureExtractionResult: AzureExtractionResult,
    { extractionRules, inferenceRules, mergeRules }: ExtractionOptions<TAttribute>,
): Promise<MergedAttributes<TAttribute>> {
    // Decide up front whether the timing line should be printed at the end.
    const timingEnabled = !isProductionEnvironment();
    const startedAt = performance.now();

    // Every extractor receives its own options object built from the same rules.
    const extractors: Array<Extractor<TAttribute>> = [
        extractCells({ extractionRules }),
        extractKeyValuePairs({ extractionRules }),
        extractTableKeyValuePairs({ extractionRules }),
        extractLines({ extractionRules }),
        extractParagraphs({ extractionRules }),
    ];

    // Step 1: Extraction
    const extractedRegions = await extract(azureExtractionResult, { extractors });

    // Step 2: Merging - thread the regions through the merge rules, in order.
    const mergedRegions = mergeRules.reduce<Region<TAttribute>[]>(
        (current, applyMergeRule) => applyMergeRule(current),
        extractedRegions,
    );

    // Step 3: Inference - seed with the confidence-merged attributes, then
    // thread them through the inference rules, in order.
    const inferredAttributes = inferenceRules.reduce<MergedAttributes<TAttribute>>(
        (current, applyInferenceRule) => applyInferenceRule(current),
        mergeByConfidence(mergedRegions),
    );

    const elapsedMs = performance.now() - startedAt;
    if (timingEnabled) {
        // eslint-disable-next-line no-console
        console.log(`processPdf took ${elapsedMs} milliseconds.`);
    }

    return inferredAttributes;
}
