#!/usr/bin/env python3

import argparse
import json
import os
import re
import logging
from bs4 import BeautifulSoup, NavigableString
from colorama import Fore, Style, init

# Initialise colorama with autoreset enabled before any coloured log output is produced.
init(autoreset=True)

class HTMLTextExtractor:
    """Collect the translatable text fragments of an HTML file.

    Only strings that are *direct* children of the tags listed in
    ``TEXT_TAGS`` are collected. Each fragment has its whitespace runs
    collapsed to a single space and is stripped at both ends.
    """

    # Tags whose direct text children are collected, in extraction order.
    TEXT_TAGS = ('title', 'span', 'a', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Stand-alone fragments that carry nothing to translate.
    IGNORED_TEXTS = frozenset({'.', '*'})

    def __init__(self, html_file):
        """Remember the path of the HTML file to read.

        Args:
            html_file: Path of the HTML file; it is opened by ``extract_text``.
        """
        self.html_file = html_file

    def extract_text(self):
        """Return the text fragments found in the HTML file.

        Returns:
            A list of non-empty strings, grouped by tag in ``TEXT_TAGS``
            order. Duplicates are kept.

        Raises:
            FileNotFoundError: If ``html_file`` is not an existing file.
            ValueError: If the file has no content.
        """
        if not os.path.isfile(self.html_file):
            raise FileNotFoundError(f"The file {self.html_file} does not exist.")

        with open(self.html_file, "r", encoding="utf8") as file:
            contents = file.read()

        if not contents:
            raise ValueError(f"The file {self.html_file} is empty or cannot be read.")

        # Local import: ``Comment`` is not part of the module-level bs4 import.
        from bs4 import Comment

        soup = BeautifulSoup(contents, 'lxml')
        texts = []
        for tag in self.TEXT_TAGS:
            for element in soup.find_all(tag):
                logging.info(f"Translating tag: {Fore.CYAN}{tag}{Style.RESET_ALL}")
                for content in element.contents:
                    # Child tags are visited on their own if they are listed in
                    # TEXT_TAGS; HTML comments are NavigableString subclasses
                    # but are markup, not text to translate.
                    if not isinstance(content, NavigableString) or isinstance(content, Comment):
                        continue
                    text = re.sub(r'\s+', ' ', content).strip()
                    # After strip() a blank fragment is the empty string, so
                    # that is the value to test for.
                    if not text or text in self.IGNORED_TEXTS:
                        continue
                    texts.append(text)
                    logging.info(f"Extracted text: {Fore.GREEN}{text}{Style.RESET_ALL}")

        logging.info(f"Finished extracting text from {Fore.BLUE}{self.html_file}{Style.RESET_ALL}")
        return texts

class JSONConverter:
    """Merge extracted texts into a JSON translation file.

    The file written has three top-level keys: ``_comment``,
    ``translations`` (source text -> translation) and ``legacy``
    (entries whose source text is no longer among the extracted texts).
    """

    def __init__(self, texts, json_file, keep_missing):
        """Store the conversion inputs.

        Args:
            texts: Iterable of extracted source strings; empty ones are skipped.
            json_file: Path of the JSON file to read (if present) and rewrite.
            keep_missing: If true, entries of the existing ``translations``
                section that are not in ``texts`` stay in ``translations``;
                otherwise they are moved to ``legacy``.
        """
        self.texts = texts
        self.json_file = json_file
        self.keep_missing = keep_missing

    @staticmethod
    def _clean_section(section):
        """Return a copy of *section* with keys and string values stripped.

        Non-string values are kept unchanged; a section that is not a JSON
        object yields an empty mapping.
        """
        if not isinstance(section, dict):
            return {}
        return {
            key.strip(): value.strip() if isinstance(value, str) else value
            for key, value in section.items()
        }

    def _load_existing(self):
        """Read the current JSON file and return ``(translations, legacy)``.

        A missing file, malformed JSON or a non-object root all yield two
        empty mappings; the latter two are reported with a warning because
        the file is about to be overwritten.
        """
        try:
            with open(self.json_file, "r", encoding="utf8") as file:
                existing_data = json.load(file)
        except FileNotFoundError:
            existing_data = {}
        except json.JSONDecodeError as e:
            logging.warning(f"Ignoring existing content of {self.json_file}: not valid JSON ({e})")
            existing_data = {}

        if not isinstance(existing_data, dict):
            logging.warning(f"Ignoring existing content of {self.json_file}: top-level value is not an object")
            existing_data = {}

        if "translations" not in existing_data and "legacy" not in existing_data:
            # Flat file: the whole object is the translations mapping.
            translations, legacy = existing_data, {}
        else:
            # Either section may be absent; treat a missing one as empty.
            translations = existing_data.get("translations", {})
            legacy = existing_data.get("legacy", {})

        return self._clean_section(translations), self._clean_section(legacy)

    def convert_to_json(self):
        """Merge ``texts`` with the existing file content and rewrite the file.

        Each text gets its existing translation, looked up first in the
        ``translations`` section and then in ``legacy``, or an empty string.
        Existing entries that are not among the texts are kept in
        ``translations`` or moved to ``legacy`` depending on ``keep_missing``.

        Raises:
            IOError: If the JSON file cannot be written.
        """
        existing_translations, existing_legacy = self._load_existing()

        new_translations = {}
        for text in self.texts:
            if not text:
                continue
            translation = existing_translations.get(text, existing_legacy.get(text, ""))
            new_translations[text] = translation
            if translation:
                logging.info(f"Found existing translation: {Fore.GREEN}{text} {Fore.WHITE}->{Fore.YELLOW} {translation}{Style.RESET_ALL}")
            else:
                logging.info(f"No translation found for: {Fore.RED}{text}{Style.RESET_ALL}")

        # Both mappings are computed against the extracted texts only, before
        # the missing entries are merged into either section.
        missing_translations = {k: v for k, v in existing_translations.items() if k not in new_translations}
        legacy_translations = {k: v for k, v in existing_legacy.items() if k not in new_translations}
        if self.keep_missing:
            new_translations.update(missing_translations)
        else:
            legacy_translations.update(missing_translations)  # move missing translations to legacy

        for text in legacy_translations:
            logging.info(f"{Fore.MAGENTA}Legacy translation: {Fore.GREEN}{text} {Fore.MAGENTA}->{Fore.YELLOW} {legacy_translations[text]}{Style.RESET_ALL}")

        translation_data = {
            "_comment": "This is a generated JSON file. Modify with care.",
            "translations": new_translations,
            "legacy": legacy_translations
        }

        try:
            with open(self.json_file, "w", encoding="utf8") as file:
                json.dump(translation_data, file, ensure_ascii=False, indent=4)
        except IOError as e:
            raise IOError(f"Could not write to file {self.json_file}: {e}") from e

        logging.info(f"Saved translated text to {Fore.BLUE}{self.json_file}{Style.RESET_ALL}")

def main():
    """Parse the command line, extract text from the HTML file and update the JSON file.

    Raises:
        SystemExit: With status 2 when the arguments are invalid (raised by
            argparse), or with status 1 and an ``Error: ...`` message on
            stderr when extraction or writing fails.
    """
    parser = argparse.ArgumentParser(
        description="Translate HTML file using provided JSON translations.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=(
            "Requirements:\n"
            "For Debian-based systems, install required packages with:\n"
            "    sudo apt-get install python3-bs4 python3-colorama python3-lxml\n"
            "For other systems, ensure the following Python modules are installed:\n"
            "    beautifulsoup4, colorama, lxml"
        )
    )
    parser.add_argument("-i", "--input", help="The HTML file to translate. The file must have an .html extension.", required=True)
    parser.add_argument("-o", "--output", help="The JSON file to store extracted text. The file must have a .json extension.", required=True)
    parser.add_argument("-k", "--keep-missing", help="Keep missing translations in the translations section instead of moving them to legacy.", action="store_true")
    parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_true")

    args = parser.parse_args()

    # INFO-level progress messages are only shown when verbosity is requested.
    if args.verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not args.input.lower().endswith('.html'):
        parser.error("The input file must have an .html extension.")

    if not args.output.lower().endswith('.json'):
        parser.error("The output file must have a .json extension.")

    try:
        extractor = HTMLTextExtractor(args.input)
        texts = extractor.extract_text()

        converter = JSONConverter(texts, args.output, args.keep_missing)
        converter.convert_to_json()
    except (FileNotFoundError, ValueError, IOError) as e:
        # A string argument makes the interpreter print the message to stderr
        # and exit with status 1, so callers can detect the failure.
        raise SystemExit(f"Error: {e}")

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
