from typing import Any, Dict, List, Optional

from collections import defaultdict
from statistics import mean

from nltk.tokenize import word_tokenize

from apadata.api.scraper.scraper_api import ScraperAPI
from apadata.text_processors.evaluators.evaluator import Evaluator

# Fallback used by KeywordsFrequencyEvaluator.compute_frequency_features when
# the "number_of_pages" field is missing from the averaged feature dict.
DEFAULT_NUMBER_OF_PAGES = 1
# Fallback used for any missing "<attribute>_hits" / "<attribute>_score" field
# (attribute being one of "title", "content", "url").
DEFAULT_GENERAL_VALUE = 1


def calculate_features_average(data_list: List[Dict[str, Any]]) -> Dict[str, float]:
    """Average every numeric field across a list of records.

    Values are grouped by key over all dictionaries in ``data_list``; only
    ``int`` / ``float`` values (which, in Python, includes ``bool``) take part.
    Keys that never carry a numeric value are left out of the result.

    Args:
        data_list: Records whose numeric fields should be averaged.

    Returns:
        A mapping from each key that had at least one numeric value to the
        arithmetic mean of those values. Empty when no numeric value is found.
    """
    samples_by_key = defaultdict(list)
    for record in data_list:
        for name in record:
            candidate = record[name]
            if not isinstance(candidate, (int, float)):
                # Non-numeric entries (strings, None, nested data) are skipped.
                continue
            samples_by_key[name].append(candidate)
    return {name: mean(samples) for name, samples in samples_by_key.items()}


class KeywordsFrequencyEvaluator(Evaluator):
    """Evaluates the frequency of a keyword relative to some other keywords.

    Frequency statistics are fetched through ``ScraperAPI.frequency_features``,
    averaged per field, turned into a capped feature vector and finally reduced
    to a single score by :meth:`evaluate`.
    """

    def __init__(self, domain: str):
        """Create the evaluator.

        Args:
            domain: Domain forwarded to the scraper API on every lookup made
                by :meth:`evaluate`.
        """
        self.api_instance = ScraperAPI()
        self.domain = domain

    def get_keywords_frequency_features(
        self, keyword: str, domain: Optional[str] = None
    ) -> Dict[str, float]:
        """Fetch the frequency records for ``keyword`` and average them.

        Args:
            keyword: Term to look up through the scraper API.
            domain: Optional domain forwarded unchanged to the API.

        Returns:
            The per-field averages of the numeric values in the API response,
            as computed by ``calculate_features_average``.
        """
        frequency_features = self.api_instance.frequency_features(
            keyword=keyword, domain=domain
        )
        return calculate_features_average(frequency_features)

    def compute_frequency_features(
        self,
        fields: Dict[str, Any],
        use_absolute_features: bool = False,
    ) -> List[float]:
        """Build the capped feature vector for one averaged feature dict.

        For each of ``title``, ``content`` and ``url`` the method reads
        ``<attribute>_hits`` and ``<attribute>_score`` (both defaulting to
        ``DEFAULT_GENERAL_VALUE``) and emits, in order: hits per page, score
        per page, and the hits-to-score ratio (plain hits when score is 0).

        Args:
            fields: Averaged frequency fields; missing keys fall back to the
                module defaults.
            use_absolute_features: When true, the raw hits and score are
                appended after each attribute's ratios, and the page count is
                appended once at the end.

        Returns:
            The feature list with every value capped at 1.0.
        """
        # A missing, None or zero page count falls back to the default so the
        # per-page ratios below cannot raise ZeroDivisionError.
        number_of_pages = fields.get("number_of_pages") or DEFAULT_NUMBER_OF_PAGES
        features = []
        for attribute in ("title", "content", "url"):
            hits = fields.get(f"{attribute}_hits", DEFAULT_GENERAL_VALUE)
            score = fields.get(f"{attribute}_score", DEFAULT_GENERAL_VALUE)
            features += [
                hits / number_of_pages,
                score / number_of_pages,
                # Avoid dividing by a zero score; use the raw hits instead.
                hits / score if score else hits,
            ]
            if use_absolute_features:
                features += [hits, score]
        if use_absolute_features:
            features.append(number_of_pages)
        return [min(feature, 1.0) for feature in features]

    def evaluate(self, keyword: str, **kwargs: Any) -> float:
        """Score ``keyword`` from its own and its tokens' frequency features.

        Every token produced by ``word_tokenize(keyword)`` and the full
        keyword itself are scored; the result is the mean of those scores.

        Args:
            keyword: Keyword (possibly multi-word) to evaluate.
            **kwargs: ``do_features_mean`` (bool, default ``True``) selects
                how the per-term feature vector is reduced; see the note in
                the body about how the flag is forwarded.

        Returns:
            The mean of the per-term scores. Each feature is capped at 1.0, so
            the result never exceeds 1.0.
        """
        do_features_mean: bool = kwargs.get("do_features_mean", True)
        # Per-call memo: a single-token keyword equals its only token, and
        # tokens can repeat, so each distinct term is looked up only once.
        # Repeated terms still contribute once per occurrence to the mean.
        score_by_term: Dict[str, float] = {}
        final_result: List[float] = []
        for term in word_tokenize(keyword) + [keyword]:
            if term not in score_by_term:
                # NOTE(review): the do_features_mean flag is also what enables
                # absolute features here; confirm this coupling is intended.
                features: List[float] = self.compute_frequency_features(
                    self.get_keywords_frequency_features(term, self.domain),
                    use_absolute_features=do_features_mean,
                )
                if do_features_mean:
                    score_by_term[term] = float(min(mean(features), 1.0))
                else:
                    score_by_term[term] = mean(features) if features else 0.0
            final_result.append(score_by_term[term])
        if not final_result:
            return 0.0
        return mean(final_result)
