import * as sdk from "microsoft-cognitiveservices-speech-sdk";

import {
  recordingSettings,
  speechConfigSettings,
  transcriptionSettings,
} from "@/settings";
import Storage from "@/core/utils/LocalStorage";
import {
  LookupWord,
  LookupWordType,
  WordSegment,
  Pause,
  TranscriptionSegment,
  TranscriptionSlide,
} from "@/core/models";
import LocalStorage from "@/core/utils/LocalStorage";

import stopWordsDe from "stopwords-de";
import stopWordsEn from "stopwords-en";

/**
 * Live speech-to-text session built on the Microsoft Speech SDK.
 *
 * The class drives a continuous recognizer, collects final recognition
 * results as segments, detects pauses between consecutive segments and groups
 * everything per slide (see `addSlide`). Per-slide statistics are produced by
 * `createSlides()` both live (handed to the `start` callback) and once more
 * when `stop()` resolves.
 *
 * All durations and offsets kept by this class are in milliseconds; the SDK
 * reports them in 100 ns ticks, hence the `/ 10_000` conversions.
 */
export class LiveTranscription {
  // Public data
  slide = 0;
  text = "";
  isTranscribing = false; // Reflects if transcription is actually busy on a segment
  running = false; // Reflects transcription recognition active state
  slides: TranscriptionSlide[] | undefined = [];
  duration = 0; // Realtime duration, uses a timer instead of recognition duration values
  recognizedDuration = 0; // Real duration captured on last recognition cycle end
  offset = 0; // Real offsets to be calculated per slide timer duration
  durations: { duration: number; offset: number; slide: number }[] = []; // Timer bundle collected per slide
  durationInterval: any;
  isPaused = false; // To be synced with isPaused on Recorder instance
  lastCallback = false;

  lang = LocalStorage.getLocale();
  fillerWords =
    this.lang === "de"
      ? transcriptionSettings.fillerWordsDe
      : transcriptionSettings.fillerWordsEn;
  stopWords = this.lang === "de" ? stopWordsDe : stopWordsEn;

  // Private data
  private segments: TranscriptionSegment[] = [];
  private pauses: Pause[] = [];
  private recognizer: sdk.SpeechRecognizer | null = null;
  private readonly onDoneCallback: () => void;

  /**
   * @param cb Invoked once every time `stop()` has finished tearing down a
   *   running recognizer.
   */
  constructor(cb: () => void = () => {}) {
    this.onDoneCallback = cb;
  }

  // Public methods

  /**
   * Resets all collected data, creates a fresh recognizer and starts
   * continuous recognition.
   *
   * The returned promise resolves as soon as the start request was issued; it
   * does not wait for the recognition session to be established.
   *
   * @param stream Currently unused; forwarded to `setup` only.
   * @param lang Forwarded to `setup`. NOTE(review): the recognition language
   *   actually applied comes from `speechConfigSettings` – confirm intent.
   * @param device Optional microphone device id; the default microphone is
   *   used when omitted.
   * @param cb Called on recognition events. Before the first final segment it
   *   receives only `text` and `isTranscribing`; afterwards it also receives
   *   the last segment, the current slide statistics and all pauses.
   */
  async start(
    stream?: MediaStream,
    lang: string = "en-US",
    device?: string,
    cb: (
      text: string,
      isTranscribing: boolean,
      segment?: TranscriptionSegment,
      slide?: TranscriptionSlide,
      pauses?: Pause[],
    ) => void = () => {},
  ): Promise<void> {
    // A previous session that was never stopped would otherwise leak its
    // recognizer (and microphone handle) and its duration timer.
    if (this.recognizer) {
      this.recognizer.close();
      this.recognizer = null;
      this.running = false;
    }

    this.segments = [];
    this.pauses = [];
    this.slides = [];
    this.slide = 0;
    this.duration = 0;
    this.recognizedDuration = 0;
    this.offset = 0;
    this.durations = [];

    const recognizer = this.setup(stream, lang, device);
    this.recognizer = recognizer;

    // Start duration timer early so it keeps in sync with recording timer
    this.startDurationTimer();

    let lastRecognized = "";
    let lastSegment: TranscriptionSegment | undefined;

    // Transcription event handlers
    recognizer.sessionStarted = (_s, _e) => {
      this.lastCallback = false;
      this.text = "";
      // Reset duration interval per start to cover pause resumes, but continue the timer
      this.startDurationTimer();
    };
    recognizer.sessionStopped = (_s, _e) => {
      this.text = "";
      clearInterval(this.durationInterval);
    };

    recognizer.speechStartDetected = (_s, _e) => {};
    recognizer.speechEndDetected = (_s, _e) => {
      this.lastCallback = true;
    };

    recognizer.recognizing = recognizer.recognized = (_s, e) => {
      this.isTranscribing = true;
      if (!e.result.text) return;

      // Text-only callback before the first sentence recognition
      if (!this.segments.length) cb(this.text, this.isTranscribing);

      // Parsed once and tolerated as missing/malformed: throwing here would
      // surface inside the SDK's event dispatch.
      const response = this.parseServiceResponse(
        e.result.properties.getProperty(
          sdk.PropertyId.SpeechServiceResponse_JsonResult,
        ),
      );
      const isInterim = e.result.reason === sdk.ResultReason.RecognizingSpeech;

      if (isInterim) {
        this.text = lastRecognized + e.result.text;
      } else {
        lastRecognized += e.result.text + "\r\n";
        this.text = lastRecognized;

        const nBest = response?.NBest;
        const best = nBest ? nBest[0] : undefined;
        const confidence = best?.Confidence;
        const displayWords = e.result.text.split(" ");
        // Word-level details are only present when word timestamps were requested.
        const words = Array.isArray(best?.Words)
          ? best.Words.map((word: any, i: number) => ({
              word: word.Word,
              displayWord: displayWords[i] || undefined,
              duration: word.Duration / 10_000,
              offset: word.Offset / 10_000,
            }))
          : undefined;

        // A pause is the silence between the end of the previous segment and
        // the start of this one, if it is at least `pauseLen` long.
        if (lastSegment) {
          const previousEnd = lastSegment.offset + lastSegment.duration;
          const gap = Math.round(e.result.offset / 10_000 - previousEnd);
          if (gap >= recordingSettings.pauseLen) {
            this.pauses.push({
              offset: Math.round(previousEnd),
              duration: gap,
              slide: this.slide,
            });
          }
        }

        const segment = {
          text: e.result.text,
          offset: Math.round(e.result.offset / 10_000),
          duration: Math.round(e.result.duration / 10_000),
          slide: this.slide,
          nBest: nBest,
          words: words,
          confidence: confidence,
        };
        this.segments.push(segment);
        lastSegment = segment;

        // Accumulate the recognized duration
        this.recognizedDuration += segment.duration;
      }

      // Without a service payload fall back to the result kind: interim
      // results mean the service is still working on the segment.
      this.isTranscribing = response
        ? response.RecognitionStatus !== "Success"
        : isInterim;

      // The full followup callbacks after the first sentence recognition
      if (this.text) {
        cb(
          // `lastSegment` may still be unset if speech end was detected
          // before the first final result arrived.
          this.lastCallback && lastSegment ? lastSegment.text : this.text,
          this.isTranscribing,
          lastSegment,
          this.createSlides()[this.slide],
          this.pauses,
        );
      }
    };

    recognizer.startContinuousRecognitionAsync(
      () => {
        this.running = true;
      },
      (error: string) => {
        console.error("LiveTranscription: failed to start recognition", error);
        clearInterval(this.durationInterval);
        this.running = false;
      },
    );
  }

  /**
   * Stops recognition, disposes the recognizer and resolves with the final
   * per-slide statistics plus the raw segments and pauses.
   *
   * Always settles: when no recognizer exists the currently stored data is
   * returned unchanged, and when the SDK reports a stop error the error is
   * logged and the data collected so far is still finalized and returned.
   */
  async stop(): Promise<{
    slides: TranscriptionSlide[] | undefined;
    segments: TranscriptionSegment[];
    pauses: Pause[];
  }> {
    const recognizer = this.recognizer;
    if (!recognizer) {
      clearInterval(this.durationInterval);
      this.running = false;
      return {
        slides: this.slides,
        segments: this.segments,
        pauses: this.pauses,
      };
    }

    return new Promise(resolve => {
      const finalize = () => {
        this.running = false;
        recognizer.close();
        this.recognizer = null;
        this.pushSlideDuration();
        const slides = this.createSlides();
        this.slides = slides;
        clearInterval(this.durationInterval);
        this.duration = 0;
        this.recognizedDuration = 0;
        this.onDoneCallback();
        resolve({ slides, segments: this.segments, pauses: this.pauses });
      };

      recognizer.stopContinuousRecognitionAsync(finalize, (error: string) => {
        console.error("LiveTranscription: failed to stop recognition", error);
        finalize();
      });
    });
  }

  /** Stops the session when it is running, otherwise starts one with defaults. */
  toggle() {
    if (this.running) void this.stop();
    else void this.start();
  }

  /**
   * Closes the timing of the current slide and moves on to the next one.
   * Recognition is restarted so that in-flight speech is finalized on the
   * slide it belongs to. Does nothing when no recognizer exists.
   */
  addSlide() {
    const recognizer = this.recognizer;
    // Without a recognizer the slide index would never advance, so recording
    // a duration entry would only corrupt the per-slide bookkeeping.
    if (!recognizer) return;

    this.pushSlideDuration();
    recognizer.stopContinuousRecognitionAsync(() => {
      recognizer.startContinuousRecognitionAsync(() => {
        this.slide += 1;
        this.duration = 0;
        this.recognizedDuration = 0;
      });
    });
  }

  /** Suspends recognition and freezes the duration timer. */
  pause() {
    this.recognizer?.stopContinuousRecognitionAsync();
    this.isPaused = true;
  }

  /** Resumes recognition on the existing recognizer and unfreezes the timer. */
  resume() {
    this.recognizer?.startContinuousRecognitionAsync();
    this.isPaused = false;
  }

  // Private methods

  /**
   * Builds a recognizer for the given (or default) microphone against the
   * configured speech service endpoint.
   */
  private setup(_stream?: MediaStream, lang = "en-US", device?: string) {
    const audio = device
      ? sdk.AudioConfig.fromMicrophoneInput(device)
      : sdk.AudioConfig.fromDefaultMicrophoneInput();
    const speechConfig = sdk.SpeechConfig.fromEndpoint(
      // Note: To use authorization token with fromEndpoint, pass an empty string to the subscriptionKey in the fromEndpoint method,
      // and then set authorizationToken="token" on the created SpeechConfig instance to use the authorization token.
      new URL(speechConfigSettings.speechServiceEndpoint),
      "",
    );
    this.configureSpeechConfig(speechConfig, lang);

    return new sdk.SpeechRecognizer(speechConfig, audio);
  }

  /**
   * Applies the values from `speechConfigSettings` and the stored access token
   * to the given speech config. `_lang` is not used.
   */
  private configureSpeechConfig(speechConfig: any, _lang: string) {
    // Config in settings.ts
    // NOTE(review): in the Speech SDK typings `enableAudioLogging` and
    // `enableDictation` look like methods on SpeechConfig rather than boolean
    // properties, so these assignments may have no effect – confirm.
    speechConfig.enableAudioLogging = speechConfigSettings.enableAudioLogging;
    speechConfig.enableTelemetry = speechConfigSettings.enableTelemetry;
    // NOTE(review): string keys are presumably used verbatim as property
    // names; verify that "30" really addresses the intended PropertyId.
    speechConfig.setProperty(
      "30",
      speechConfigSettings.initialSilenceTimeoutMs,
    ); // PropertyId 30 = InitialSilenceTimeoutMs
    speechConfig.speechRecognitionLanguage =
      speechConfigSettings.speechRecognitionLanguage;
    speechConfig.setProfanity(speechConfigSettings.profanityOption);
    speechConfigSettings.requestWordLevelTimestamps &&
      speechConfig.requestWordLevelTimestamps();
    speechConfig.enableDictation = speechConfigSettings.enableDictation;
    speechConfig.outputFormat = speechConfigSettings.outputFormat;
    speechConfig.authorizationToken = Storage.getAccessToken();
  }

  /** (Re)starts the 100 ms timer that advances `duration` while not paused. */
  private startDurationTimer() {
    clearInterval(this.durationInterval);
    this.durationInterval = setInterval(() => {
      if (!this.isPaused) this.duration += 100;
    }, 100);
  }

  /**
   * Parses the service JSON attached to a recognition result.
   * Returns `undefined` for a missing, empty, malformed or non-object payload.
   */
  private parseServiceResponse(
    raw: string | undefined,
  ): { RecognitionStatus?: string; NBest?: any[] } | undefined {
    if (!raw) return undefined;
    try {
      const parsed: unknown = JSON.parse(raw);
      return typeof parsed === "object" && parsed !== null
        ? (parsed as { RecognitionStatus?: string; NBest?: any[] })
        : undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Start offset of a slide: the recorded value when the slide's timing has
   * already been pushed, otherwise the end of the previous slide.
   */
  private slideStartOffset(slide: number): number {
    if (slide <= 0) return 0;
    const recorded = this.durations[slide];
    if (recorded) return recorded.offset;
    const previous = this.durations[slide - 1];
    return previous ? previous.offset + previous.duration : this.offset;
  }

  /** Rate per minute rounded to two decimals; 0 when no time has elapsed. */
  private perMinute(count: number, durationMs: number): number {
    if (!(durationMs > 0)) return 0;
    return Math.round((count / (durationMs / 1000 / 60)) * 100) / 100;
  }

  /** Records the timer duration and start offset of the current slide. */
  private pushSlideDuration() {
    this.offset = this.slideStartOffset(this.slide);

    this.durations.push({
      duration: this.duration,
      offset: this.offset,
      slide: this.slide,
    });
  }

  /**
   * Builds the statistics for every slide from 0 up to the slide of the most
   * recent segment. Returns an empty array while no segment exists.
   */
  private createSlides(): TranscriptionSlide[] {
    if (this.segments.length === 0) return [];

    const lastSlide = this.segments[this.segments.length - 1].slide;
    const slides: TranscriptionSlide[] = [];

    for (let i = 0; i <= lastSlide; i++) {
      const slideSegments = this.segments.filter(
        segment => segment.slide === i,
      );
      const slidePauses = this.pauses.filter(pause => pause.slide === i);

      const text = slideSegments.map(segment => segment.text).join(" ");

      // The current slide is still being timed; finished slides use the
      // duration recorded when they were closed.
      const duration =
        i === this.slide ? this.duration : this.durations[i]?.duration ?? 0;
      const offset = this.slideStartOffset(i);

      const words: WordSegment[] = [];
      let spokenDuration = 0;
      for (const segment of slideSegments) {
        if (segment.words) words.push(...segment.words);
        spokenDuration += segment.duration;
      }

      // Count occurrences per distinct word, keeping first-seen order.
      const lookup = new Map<string, LookupWord>();
      for (const word of words) {
        const known = lookup.get(word.word);
        if (known) {
          known.occurrences++;
          continue;
        }
        lookup.set(word.word, {
          text: word.word,
          occurrences: 1,
          type: this.fillerWords.includes(word.word)
            ? LookupWordType.filler
            : this.stopWords.includes(word.word)
            ? LookupWordType.stop
            : LookupWordType.default,
        });
      }
      const lookupWords: LookupWord[] = Array.from(lookup.values());

      const pausesMade = slidePauses.length;
      const totalPauseTime =
        Math.round(
          slidePauses.reduce((sum, pause) => sum + pause.duration, 0) * 10,
        ) / 10;
      // Average over this slide's pauses only; 0 instead of NaN without pauses.
      const avgPauseLength =
        pausesMade > 0
          ? Math.round((totalPauseTime / pausesMade) * 10) / 10
          : 0;

      const pausesPerMinute = this.perMinute(pausesMade, duration);
      // Words are measured against the recognized speech time of this very
      // slide rather than the running total of the current slide.
      const wordsPerMinute = this.perMinute(words.length, spokenDuration);

      // Set all slide values
      slides.push({
        text,
        slide: i,
        duration: duration,
        wpm: wordsPerMinute,
        offset: offset,
        avgPauseLength: avgPauseLength,
        totalPauseTime: totalPauseTime,
        pausesMade: pausesMade,
        ppm: pausesPerMinute,
        pausesPerMinute: pausesPerMinute,
        lookupWords: lookupWords,
        words: words,
      });
    }
    return slides;
  }
}
