from typing import Any, Dict, List, Optional

import elasticsearch

from apadata.constants import ELASTICSEARCH_URL


class ElasticsearchClient:
    """
    Client used to retrieve excerpts from our indexed web content data for a
    specific domain and URL, optionally narrowed down by a keyword. This is
    crucial for our target industries pipeline and our chat gpt summary
    extraction.

    Parameters
    ----------
    index : str
        Elasticsearch index (or index pattern) that will be searched in order to
        retrieve web content given a domain and optionally a keyword
    """

    # Markup wrapped around every highlighted fragment returned by Elasticsearch.
    _HIGHLIGHT_PRE_TAG = '<span style="background-color: #fff70061">'
    _HIGHLIGHT_POST_TAG = "</span>"
    # Document fields that are both keyword-searched and highlighted.
    _TEXT_FIELDS = ("title", "content")

    def __init__(self, index: str = "content-*"):
        self.index = index

    def get_web_contents(self, hits: List[Dict[str, Any]]) -> List[str]:
        """
        Convert a list of search hits into a list of plain text excerpts.

        Parameters
        ----------
        hits : List[Dict[str, Any]]
            Hits in the shape returned by :meth:`search`

        Returns
        -------
        List[str]
            One ``"<title> <content>"`` string per hit, in the same order
        """
        return [self.extract_content(hit) for hit in hits]

    def extract_content(self, hit: Dict[str, Any]) -> str:
        """
        Build the text excerpt of a single hit from its ``_source`` document.

        Missing ``_source``, ``title`` or ``content`` keys are treated as empty
        strings, so a hit without any of them yields ``" "``.

        Parameters
        ----------
        hit : Dict[str, Any]
            A single search hit

        Returns
        -------
        str
            The title and the content joined by a single space
        """
        source = hit.get("_source", {})
        title = source.get("title", "")
        content = source.get("content", "")
        return f"{title} {content}"

    @staticmethod
    def _build_query(
        domain: str, url: str, keyword: Optional[str] = None
    ) -> Dict[str, Any]:
        """Build the bool query: domain and url always, keyword when given."""
        must: List[Dict[str, Any]] = [
            {"match": {"domain": domain}},
            {"match": {"url": url}},
        ]
        if keyword:
            # Append rather than replace: the keyword narrows the results for
            # the requested domain/url instead of searching the whole index.
            must.append(
                {
                    "simple_query_string": {
                        "query": keyword,
                        "fields": list(ElasticsearchClient._TEXT_FIELDS),
                        "default_operator": "or",
                    }
                }
            )
        return {"bool": {"must": must}}

    @staticmethod
    def _build_highlight() -> Dict[str, Any]:
        """Build the highlight section applied to the title and content fields."""
        return {
            "number_of_fragments": 3,
            "fragment_size": 250,
            "fields": {
                field: {
                    "pre_tags": [ElasticsearchClient._HIGHLIGHT_PRE_TAG],
                    "post_tags": [ElasticsearchClient._HIGHLIGHT_POST_TAG],
                }
                for field in ElasticsearchClient._TEXT_FIELDS
            },
        }

    def search(
        self,
        domain: str,
        url: str,
        keyword: Optional[str] = None,
        size: int = 1,
        from_: Optional[int] = None,
    ) -> Any:
        """
        Search the configured index for web content of a given domain and URL.

        Parameters
        ----------
        domain : str
            Value matched against the ``domain`` field
        url : str
            Value matched against the ``url`` field
        keyword : Optional[str]
            When given (and non-empty), hits must additionally match this
            keyword in their ``title`` or ``content``
        size : int
            Maximum number of hits to return
        from_ : Optional[int]
            Offset of the first hit to return, passed through to Elasticsearch

        Returns
        -------
        Any
            The ``hits.hits`` entry of the Elasticsearch response
        """
        # Build the request body up front so the connection is only held open
        # for the duration of the actual search call.
        query = self._build_query(domain, url, keyword)
        highlight = self._build_highlight()
        source_excludes: List[str] = []

        with elasticsearch.Elasticsearch(ELASTICSEARCH_URL) as es:
            result = es.search(
                index=self.index,
                query=query,
                from_=from_,
                size=size,
                highlight=highlight,
                source_excludes=source_excludes,
            )
        return result["hits"]["hits"]
