from typing import List

from dataclasses import dataclass, field

import spacy

from .constants import (
    LangCode,
    SpacyComponent,
    SpacyGenre,
    SpacyPipeline,
    SpacyWordRootOption,
)


@dataclass
class SpacyConfiguration:
    """
    Configuration describing which spaCy pipeline to use and how to tweak it.

    The instance is validated on construction (``__post_init__`` calls
    :meth:`setup`). The class itself only stores the options and validates the
    lookup-lemmatizer request; nothing in this class loads or modifies a pipeline.

    Parameters
    ----------
    lang_code : LangCode
        2-3 letters language code. Its ``.value`` is the first component of
        :attr:`pipeline_name`.
    pipeline_type : SpacyPipeline
        Type of spaCy pipeline (originally documented as sm_pipeline,
        md_pipeline, lg_pipeline or "trf"; see ``SpacyPipeline`` for the
        authoritative values). Last component of :attr:`pipeline_name`.
    pipeline_genre : SpacyGenre
        Genre of the spaCy pipeline; middle component of :attr:`pipeline_name`
        (presumably something like "core_web" / "core_news" - confirm against
        ``SpacyGenre``).
    disable : List[SpacyComponent]
        Components that should be disabled in order to reduce computation time
        and involved resources (can be any of: ["tok2vec", "tagger", "parser",
        "attribute_ruler", "lemmatizer", "ner"]). Defaults to an empty list.
    use_lookup_lemmatizer : bool
        If set to True, request switching the current lemmatizer (which can be
        either trainable or rule-based) to a lookup-based one. This is only
        accepted for Dutch, English, French, Greek, Macedonian, Norwegian and
        Spanish (at least, as of August 2023); any other language raises a
        ``ValueError`` at construction time.
    use_senter_over_parser : bool
        Set it to True if you need fast sentence segmentation without dependency
        parses (basically, the parser is disabled and the senter component is
        used instead). The senter component is ~10x faster than the parser and
        more accurate than the rule-based sentencizer. Do check that this is
        allowed; this class does not validate it.
    use_default_over_trainable : bool
        Since v3.3, a number of pipelines use a trainable lemmatizer. You can
        check whether the lemmatizer is trainable. If you'd like to switch to a
        non-trainable lemmatizer that's similar to v3.2 or earlier, you can
        replace the trainable lemmatizer with the default non-trainable
        lemmatizer. Do check that this is allowed; this class does not validate
        it.
    word_root_pipeline : SpacyWordRootOption
        "lemmatizer" or "stemmer" based on what we want to use for word root
        extraction. Defaults to the lemmatizer option.
    max_input_length : int
        Maximum length of input text. Defaults to 1,000,000.

    Raises
    ------
    ValueError
        If ``use_lookup_lemmatizer`` is True for an unsupported language.
    """

    lang_code: LangCode
    pipeline_type: SpacyPipeline
    pipeline_genre: SpacyGenre
    disable: List[SpacyComponent] = field(default_factory=list)
    use_lookup_lemmatizer: bool = False
    use_senter_over_parser: bool = False
    use_default_over_trainable: bool = False
    word_root_pipeline: SpacyWordRootOption = SpacyWordRootOption(
        SpacyWordRootOption.LEMMATIZER
    )
    max_input_length: int = 1_000_000

    # Language codes for which switching to a lookup lemmatizer is accepted.
    # Deliberately left unannotated so that ``dataclass`` does not treat it as
    # an instance field.
    _LOOKUP_LEMMATIZER_LANG_CODES = (
        "nl",
        "en",
        "fr",
        "el",
        "es",
        "mk",
        "nb",
        "nn",
    )

    def __post_init__(self):
        """Validate the configuration right after the dataclass is built."""
        self.setup()

    @staticmethod
    def is_spacy_pipeline_available(pipeline_name: str) -> bool:
        """Return what ``spacy.util.is_package`` reports for ``pipeline_name``."""
        return spacy.util.is_package(pipeline_name)

    def setup(self) -> None:
        """
        Validate option combinations.

        Raises
        ------
        ValueError
            If a lookup lemmatizer is requested for a language whose code is not
            in the supported list.
        """
        if not self.use_lookup_lemmatizer:
            return

        if self.lang_code.value not in self._LOOKUP_LEMMATIZER_LANG_CODES:
            raise ValueError(
                f"Cannot switch to lookup lemmatizer with {self.lang_code} language!"
            )

    @property
    def pipeline_name(self) -> str:
        """
        Name of the pipeline package, built as ``<lang>_<genre>_<type>``.

        The genre and type are resolved through ``.value`` when they have one,
        consistently with ``lang_code``. Interpolating an enum member directly
        into an f-string may yield ``ClassName.MEMBER`` instead of its value,
        depending on how the enum is defined and on the Python version.
        Objects without a ``.value`` attribute (e.g. plain strings) are used
        as they are.
        """
        genre = getattr(self.pipeline_genre, "value", self.pipeline_genre)
        kind = getattr(self.pipeline_type, "value", self.pipeline_type)
        return f"{self.lang_code.value}_{genre}_{kind}"
