from typing import Dict, List

from collections import defaultdict

from apadata.loaders import JSONLoader
from apadata.loaders.xlsx_loader import XLSXLoader
from apadata.utils import flatten

# Maps each coarse entity category to the set of raw NER labels folded into it.
# The label spellings vary in case and style ("ORG" / "orgName",
# "PER" / "persName" / "PRS"), so several tagging schemes appear to be covered
# at once; the models that emit these labels are not named in this module.
# NOTE(review): "LC" is listed under "company" while "OG", "DT" and "QT" are
# listed under "miscellaneous". In tag sets that use the two-letter labels
# PS/LC/OG/DT/TI/QT these usually denote location, organization, date and
# quantity -- confirm that the current placement is intentional.
ner_labels_categories_dict = {
    "company": {"ORGANIZATION", "ORG", "orgName", "LC"},
    "space": {"LOC", "GPE", "geogName", "placeName", "NAT_REL_POL", "FACILITY"},
    "time": {"time", "TME", "DATETIME", "date", "TIME", "PERIOD"},
    "person": {"PERSON", "PER", "persName", "PRS"},
    "product": {"PRODUCT", "WRK", "WORK_OF_ART", "OBJ"},
    "numeric": {"NUMERIC_VALUE", "QUANTITY", "MONEY", "CARDINAL", "PERCENT", "ORDINAL"},
    # Catch-all bucket for labels not assigned to any category above.
    "miscellaneous": {
        "LANGUAGE",
        "OG",
        "EVN",
        "MSR",
        "MISC",
        "TITLE_AFFIX",
        "DT",
        "NORP",
        "LAW",
        "QT",
    },
}

# Set of industry / sector name strings. The constant's name indicates these
# are Clearbit industry names; the authoritative source of the list is not
# referenced in this module -- verify against it before editing.
# Being a set, duplicates collapse and iteration order carries no meaning.
CLEARBIT_INDUSTRIES_NAMES = {
    "Automotive",
    "Consumer Discretionary",
    "Consumer Goods",
    "Household Durables",
    "Leisure Products",
    "Textiles, Apparel & Luxury Goods",
    "Consumer Services",
    "Diversified Consumer Services",
    "Hotels, Restaurants & Leisure",
    "Education Services",
    "Family Services",
    "Specialized Consumer Services",
    "Media",
    "Distributors",
    "Retailing",
    "Specialty Retail",
    "Consumer Staples",
    "Food & Staples Retailing",
    "Beverages",
    "Food Products",
    "Tobacco",
    "Personal Products",
    "Gas Utilities",
    "Banks",
    "Diversified Financial Services",
    "Capital Markets",
    "Insurance",
    "Real Estate",
    "Health Care Equipment & Supplies",
    "Health Care Providers & Services",
    "Biotechnology",
    "Life Sciences Tools & Services",
    "Pharmaceuticals",
    "Aerospace & Defense",
    "Capital Goods",
    "Construction & Engineering",
    "Electrical Equipment",
    "Industrial Conglomerates",
    "Machinery",
    "Trading Companies & Distributors",
    "Commercial Services & Supplies",
    "Professional Services",
    "Industrials",
    "Air Freight & Logistics",
    "Airlines",
    "Marine",
    "Road & Rail",
    "Transportation",
    "Semiconductors & Semiconductor Equipment",
    "Internet Software & Services",
    "IT Services",
    "Software",
    "Communications Equipment",
    "Electronic Equipment, Instruments & Components",
    "Technology Hardware, Storage & Peripherals",
    "Building Materials",
    "Chemicals",
    "Containers & Packaging",
    "Metals & Mining",
    "Paper & Forest Products",
    "Diversified Telecommunication Services",
    "Wireless Telecommunication Services",
    "Renewable Electricity",
    "Electric Utilities",
    "Utilities",
}

# Path handed to JSONLoader for the industry synonyms file. It is a relative
# path; how it is resolved is up to JSONLoader (not visible here).
SYNONYMS_FILEPATH = "text_processors/synonyms_gpt.json"
# Loaded eagerly when this module is imported. Presumably maps an industry
# name to its synonyms -- schema not visible here, verify against the file.
INDUSTRY_NAME_TO_SYNONYMS = JSONLoader(SYNONYMS_FILEPATH).load()

# Multilingual counterparts of the synonyms mapping and of the industry-name
# vocabulary, also loaded at import time.
# NOTE(review): both files are read from text_processors/tests/ -- confirm that
# directory is available wherever this module is imported.
INDUSTRY_NAME_TO_SYNONYMS_MULTILANG = JSONLoader(
    filepath="text_processors/tests/industry_name_to_synonyms_multilang.json",
).load()
CLEARBIT_INDUSTRIES_NAMES_MULTILANG = JSONLoader(
    filepath="text_processors/tests/clearbit_industries_names_multilang.json",
).load()


# Content of the Clearbit -> Apadua industry spreadsheet, read when this module
# is imported. The code below slices it and indexes each element by position,
# i.e. it is treated as a sequence of rows.
clearbit_to_apadua_industries_mapping = XLSXLoader(
    "text_processors/tests/clearbit_industries_to_apadua_industries.xlsx"
).load()

# Lookup from column index 2 to column index 5 of every row except the first.
# Presumably row 0 is a header, column 2 the Clearbit industry name and
# column 5 the Apadua industry name -- TODO confirm against the spreadsheet.
# If a column-2 value occurs in several rows, the last row wins.
clearbit_to_apadua_industries = {
    row[2]: row[5] for row in clearbit_to_apadua_industries_mapping[1:]
}


def revert_dictionary(original_dict: Dict[str, str]) -> Dict[str, List[str]]:
    """Group the keys of ``original_dict`` by the value they map to.

    Args:
        original_dict: Mapping whose values become the keys of the result.

    Returns:
        A ``defaultdict(list)`` that maps every distinct value of
        ``original_dict`` to the list of keys that pointed at it, in the
        iteration order of ``original_dict``. Because the result is a
        ``defaultdict``, indexing it with an absent key returns (and stores)
        an empty list instead of raising ``KeyError``.
    """
    grouped_keys: Dict[str, List[str]] = defaultdict(list)
    for source_key, target in original_dict.items():
        grouped_keys[target].append(source_key)
    return grouped_keys


# Inverse of clearbit_to_apadua_industries: every value of that mapping becomes
# a key here, associated with the list of keys that mapped to it.
# revert_dictionary builds its result as a defaultdict(list), so indexing this
# with an unknown key returns -- and stores -- an empty list rather than
# raising KeyError.
apadua_to_clearbit_industries: Dict[str, List[str]] = revert_dictionary(
    clearbit_to_apadua_industries
)


# Apadua industry names. The position of a name in this tuple is its integer
# index, so reordering or inserting in the middle changes the indices of the
# entries that follow: add new names at the end.
_APADUA_INDUSTRY_NAMES = (
    "Aerospace & Defense",
    "Automotive",
    "Banking",
    "Chemicals, Pharma & Biotech",
    "Construction & Infrastructure",
    "Consumer Goods",
    "Electronics & Hardware",
    "Energy & Renewables",
    "Food & Beverages",
    "Gastronomy, Tourism & Leisure",
    "Healthcare",
    "High Tech",
    "Industrial Manufacturing",
    "Insurance",
    "Machinery & Equipment",
    "Medical Equipment",
    "Mining",
    "Oil & Gas",
    "Packaging",
    "Paper",
    "Payments",
    "Public Sector",
    "Real Estate",
    "Retail & Wholesale",
    "Services",
    "Software & Internet",
    "Telecommunication",
    "Textile Goods & Fashion",
    "Transport & Logistics",
    "Utilities",
)

# Industry name -> index, numbered consecutively from 0 in tuple order.
apadua_industry_to_index = {
    name: position for position, name in enumerate(_APADUA_INDUSTRY_NAMES)
}

# Index -> industry name, the reverse lookup of the mapping above.
index_to_apadua_industry = dict(enumerate(_APADUA_INDUSTRY_NAMES))


# Multilingual URL pre-selection tags, read when this module is imported. The
# statements below treat the loaded object as a mapping whose values are
# themselves mappings.
# NOTE(review): presumably {english_tag: {language: translated_tag}} -- confirm
# against the JSON file, which is read from text_processors/tests/.
URL_PRESELECTION_TAGS_MULTILANG = JSONLoader(
    filepath="text_processors/tests/url_preselection_tags_multilang.json",
).load()

# The values of every inner mapping, passed through `flatten`; presumably this
# yields a single flat list of all tag variants -- verify `flatten`'s contract.
URL_PRESELECTION_TAGS_MULTILANG_FLAT_LIST = flatten(
    [mapping.values() for mapping in URL_PRESELECTION_TAGS_MULTILANG.values()]
)

# Top-level keys of the multilingual mapping, in its iteration order. The name
# suggests these are the English tags -- verify against the JSON file.
URL_PRESELECTION_TAGS_EN = list(URL_PRESELECTION_TAGS_MULTILANG.keys())
