EOS/src/akkudoktoreos/prediction/weatherclearoutside.py

"""Weather Forecast.

This module provides classes and methods to retrieve, manage, and process weather forecast data
from various online sources. It includes structured representations of weather data and utilities
for fetching forecasts for specific locations and time ranges. By integrating multiple data sources,
the module enables flexible access to weather information based on latitude, longitude, and
desired time periods.

Notes:
    - Supported weather sources can be expanded by adding new fetch methods within the
      WeatherForecast class.
    - Ensure appropriate API keys or configurations are set up if required by external data sources.
"""

import re
from typing import Dict, List, Optional, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup

from akkudoktoreos.core.logging import get_logger
from akkudoktoreos.prediction.weatherabc import WeatherDataRecord, WeatherProvider
from akkudoktoreos.utils.cacheutil import cache_in_file
from akkudoktoreos.utils.datetimeutil import to_datetime, to_duration, to_timezone

logger = get_logger(__name__)


WheaterDataClearOutsideMapping: List[Tuple[str, Optional[str], Optional[float]]] = [
    # clearoutside_key, description, corr_factor
    ("DateTime", "DateTime", None),
    ("Total Clouds (% Sky Obscured)", "Total Clouds (% Sky Obscured)", 1),
    ("Low Clouds (% Sky Obscured)", "Low Clouds (% Sky Obscured)", 1),
    ("Medium Clouds (% Sky Obscured)", "Medium Clouds (% Sky Obscured)", 1),
    ("High Clouds (% Sky Obscured)", "High Clouds (% Sky Obscured)", 1),
    ("ISS Passover", None, None),
    ("Visibility (miles)", "Visibility (m)", 1609.34),
    ("Fog (%)", "Fog (%)", 1),
    ("Precipitation Type", "Precipitation Type", None),
    ("Precipitation Probability (%)", "Precipitation Probability (%)", 1),
    ("Precipitation Amount (mm)", "Precipitation Amount (mm)", 1),
    ("Wind Speed (mph)", "Wind Speed (kmph)", 1.60934),
    ("Chance of Frost", "Chance of Frost", None),
    ("Temperature (°C)", "Temperature (°C)", 1),
    ("Feels Like (°C)", "Feels Like (°C)", 1),
    ("Dew Point (°C)", "Dew Point (°C)", 1),
    ("Relative Humidity (%)", "Relative Humidity (%)", 1),
    ("Pressure (mb)", "Pressure (mb)", 1),
    ("Ozone (du)", "Ozone (du)", 1),
    # Extra extraction
    ("Wind Direction (°)", "Wind Direction (°)", 1),
    # Generated from above
    ("Preciptable Water (cm)", "Preciptable Water (cm)", 1),
    ("Global Horizontal Irradiance (W/m2)", "Global Horizontal Irradiance (W/m2)", 1),
    ("Direct Normal Irradiance (W/m2)", "Direct Normal Irradiance (W/m2)", 1),
    ("Diffuse Horizontal Irradiance (W/m2)", "Diffuse Horizontal Irradiance (W/m2)", 1),
]
"""Mapping of ClearOutside weather data keys to WeatherDataRecord field description.

A list of tuples: (ClearOutside key, field description, correction factor).
"""


class WeatherClearOutside(WeatherProvider):
    """Retrieves and processes weather forecast data from ClearOutside.

    WeatherClearOutside is a thread-safe singleton, ensuring only one instance of this class is created.

    Attributes:
        prediction_hours (int, optional): The number of hours into the future for which predictions are generated.
        prediction_historic_hours (int, optional): The number of past hours for which historical data is retained.
        latitude (float, optional): The latitude in degrees, must be within -90 to 90.
        longitude (float, optional): The longitude in degrees, must be within -180 to 180.
        start_datetime (datetime, optional): The starting datetime for predictions, defaults to the current datetime if unspecified.
        end_datetime (datetime, computed): The datetime representing the end of the prediction range,
            calculated based on `start_datetime` and `prediction_hours`.
        keep_datetime (datetime, computed): The earliest datetime for retaining historical data, calculated
            based on `start_datetime` and `prediction_historic_hours`.
    """

    @classmethod
    def provider_id(cls) -> str:
        return "ClearOutside"

    @cache_in_file(with_ttl="1 hour")
    def _request_forecast(self) -> requests.Response:
        """Requests weather forecast from ClearOutside.

        Returns:
            response: Weather forecast request response from ClearOutside.
        """
        source = "https://clearoutside.com/forecast"
        latitude = round(self.config.latitude, 2)
        longitude = round(self.config.longitude, 2)
        response = requests.get(f"{source}/{latitude}/{longitude}?desktop=true")
        response.raise_for_status()  # Raise an error for bad responses
        logger.debug(f"Response from {source}: {response}")
        # We are working on fresh data (no cache), report update time
        self.update_datetime = to_datetime(in_timezone=self.config.timezone)
        return response

    def _update_data(self, force_update: Optional[bool] = None) -> None:
        """Scrape weather forecast data from ClearOutside's website.

        This method requests weather forecast data from ClearOutside based on latitude
        and longitude, then processes and structures this data for further use in analysis.

        The forecast data includes a variety of weather parameters such as cloud cover, temperature,
        humidity, visibility, precipitation, wind speed, and additional irradiance values
        calculated using the cloud cover data.

        Raises:
            ValueError: If the HTML structure of ClearOutside's website changes, causing
            extraction issues with forecast dates, timezone, or expected data sections.

        Note:
            - The function partly builds on code from https://github.com/davidusb-geek/emhass/blob/master/src/emhass/forecast.py (MIT License).
            - Uses `pvlib` to estimate irradiance (GHI, DNI, DHI) based on cloud cover data.

        Workflow:
            1. **Retrieve Web Content**: Uses a helper method to fetch or retrieve cached ClearOutside HTML content.
            2. **Extract Forecast Date and Timezone**:
                - Parses the forecast's start and end dates and the UTC offset from the "Generated" header.
            3. **Extract Weather Data**:
                - For each day in the 7-day forecast, the function finds detailed weather parameters
                and associates values for each hour.
                - Parameters include cloud cover, temperature, humidity, visibility, and precipitation type, among others.
            4. **Irradiance Calculation**:
                - Calculates irradiance (GHI, DNI, DHI) values using cloud cover data and the `pvlib` library.
            5. **Store Data**:
                - Combines all hourly data into `WeatherDataRecord` objects, with keys
                standardized according to `WeatherDataRecord` attributes.
        """
        # Get ClearOutside web content - either from site or cached
        response = self._request_forecast(force_update=force_update)  # type: ignore

        # Scrape the data
        soup = BeautifulSoup(response.content, "html.parser")

        # Find generation data
        p_generated = soup.find("h2", string=lambda text: text and text.startswith("Generated:"))
        if not p_generated:
            error_msg = f"Clearoutside schema change. Could not get '<h2>Generated:', got {p_generated} from {str(response.content)}."
            logger.error(error_msg)
            raise ValueError(error_msg)
        # Extract forecast start and end dates
        forecast_pattern = r"Forecast: (\d{2}/\d{2}/\d{2}) to (\d{2}/\d{2}/\d{2})"
        forecast_match = re.search(forecast_pattern, p_generated.get_text())
        if forecast_match:
            forecast_start_date = forecast_match.group(1)
            forecast_end_date = forecast_match.group(2)
        else:
            error_msg = f"Clearoutside schema change. Could not extract forecast start and end dates from {p_generated}."
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Extract timezone offset
        timezone_pattern = r"Timezone: UTC([+-]\d+)\.(\d+)"
        timezone_match = re.search(timezone_pattern, p_generated.get_text())
        if timezone_match:
            hours = int(timezone_match.group(1))
            # Convert the decimal part to minutes (e.g., .50 -> 30 minutes)
            minutes = int(timezone_match.group(2)) * 6  # Multiply by 6 to convert to minutes

            # Create the timezone object using offset
            utc_offset = float(hours) + float(minutes) / 60.0
            forecast_timezone = to_timezone(utc_offset=utc_offset)
        else:
            error_msg = "Clearoutside schema change. Could not extract forecast timezone."
            logger.error(error_msg)
            raise ValueError(error_msg)

        forecast_start_datetime = to_datetime(
            forecast_start_date, in_timezone=forecast_timezone, to_maxtime=False
        )

        # Get key mapping from description
        clearoutside_key_mapping: Dict[str, Tuple[Optional[str], Optional[float]]] = {}
        for clearoutside_key, description, corr_factor in WheaterDataClearOutsideMapping:
            if description is None:
                clearoutside_key_mapping[clearoutside_key] = (None, None)
                continue
            weatherdata_key = WeatherDataRecord.key_from_description(description)
            if weatherdata_key is None:
                # Should not happen
                error_msg = f"No WeatherDataRecord key for '{description}'"
                logger.error(error_msg)
                raise ValueError(error_msg)
            clearoutside_key_mapping[clearoutside_key] = (weatherdata_key, corr_factor)

        # Find all paragraphs with id 'day_<x>'. There should be seven.
        p_days = soup.find_all(id=re.compile(r"day_[0-9]"))
        if len(p_days) != 7:
            error_msg = f"Clearoutside schema change. Found {len(p_days)} day tables, expected 7."
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Delete all records that will be newly added
        self.delete_by_datetime(start_datetime=forecast_start_datetime)

        # Collect weather data, loop over all days
        for day, p_day in enumerate(p_days):
            # Within day_x paragraph find the details labels
            p_detail_labels = p_day.find_all(class_="fc_detail_label")
            detail_names = [p.get_text() for p in p_detail_labels]

            # Check for schema changes
            if len(detail_names) < 18:
                error_msg = f"Clearoutside schema change. Unexpected number ({len(detail_names)}) of `fc_detail_label`."
                logger.error(error_msg)
                raise ValueError(error_msg)
            for detail_name in detail_names:
                if detail_name not in clearoutside_key_mapping:
                    warning_msg = (
                        f"Clearoutside schema change. Unexpected detail name {detail_name}."
                    )
                    logger.warning(warning_msg)

            # Find all the paragraphs that are associated to the details.
            # Beware there is one ul paragraph before that is not associated to a detail
            p_detail_tables = p_day.find_all("ul")
            if len(p_detail_tables) != len(detail_names) + 1:
                error_msg = f"Clearoutside schema change. Unexpected number ({p_detail_tables}) of `ul` for details {len(detail_names)}. Should be one extra only."
                logger.error(error_msg)
                raise ValueError(error_msg)
            p_detail_tables.pop(0)

            # Create clearout data
            clearout_data = {}
            # Replace some detail names that we use differently
            detail_names = [
                s.replace("Wind Speed/Direction (mph)", "Wind Speed (mph)") for s in detail_names
            ]
            # Number of detail values. On last day may be less than 24.
            detail_values_count = None
            # Add data values
            scrape_detail_names = detail_names.copy()  # do not change list during iteration!
            for i, detail_name in enumerate(scrape_detail_names):
                p_detail_values = p_detail_tables[i].find_all("li")

                # Assure the number of values fits
                p_detail_values_count = len(p_detail_values)
                if (day == 6 and p_detail_values_count > 24) or (
                    day < 6 and p_detail_values_count != 24
                ):
                    error_msg = f"Clearoutside schema change. Unexpected number ({p_detail_values_count}) of `li` for detail `{detail_name}` data. Should be 24 or less on day 7. Table is `{p_detail_tables[i]}`."
                    logger.error(error_msg)
                    raise ValueError(error_msg)
                if detail_values_count is None:
                    # Remember detail values count only once
                    detail_values_count = p_detail_values_count
                if p_detail_values_count != detail_values_count:
                    # Value count for details differ.
                    error_msg = f"Clearoutside schema change. Number ({p_detail_values_count}) of `li` for detail `{detail_name}` data is different than last one {detail_values_count}. Table is `{p_detail_tables[i]}`."
                    logger.error(error_msg)
                    raise ValueError(error_msg)

                # Scrape the detail values
                detail_data = []
                extra_detail_name = None
                extra_detail_data = []
                for p_detail_value in p_detail_values:
                    if detail_name == "Wind Speed (mph)":
                        # Get the  usual value
                        value_str = p_detail_value.get_text()
                        # Also extract extra data
                        extra_detail_name = "Wind Direction (°)"
                        extra_value = None
                        match = re.search(r"(\d+)°", str(p_detail_value))
                        if match:
                            extra_value = float(match.group(1))
                        else:
                            error_msg = f"Clearoutside schema change. Can't extract direction angle from `{p_detail_value}` for detail `{extra_detail_name}`. Table is `{p_detail_tables[i]}`."
                            logger.error(error_msg)
                            raise ValueError(error_msg)
                        extra_detail_data.append(extra_value)
                    elif (
                        detail_name in ("Precipitation Type", "Chance of Frost")
                        and hasattr(p_detail_value, "title")
                        and p_detail_value.title
                    ):
                        value_str = p_detail_value.title.string
                    else:
                        value_str = p_detail_value.get_text()
                    try:
                        value = float(value_str)
                    except ValueError:
                        value = value_str
                    detail_data.append(value)
                clearout_data[detail_name] = detail_data
                if extra_detail_name:
                    if extra_detail_name not in detail_names:
                        detail_names.append(extra_detail_name)
                    clearout_data[extra_detail_name] = extra_detail_data
                    logger.debug(f"Added extra data {extra_detail_name} with {extra_detail_data}")

            # Add datetimes of the scrapped data
            clearout_data["DateTime"] = [
                forecast_start_datetime + to_duration(f"{day} days {i} hours")
                for i in range(0, detail_values_count)  # type: ignore[arg-type]
            ]
            detail_names.append("DateTime")

            # Converting the cloud cover into Irradiance (GHI, DNI, DHI)
            cloud_cover = pd.Series(
                data=clearout_data["Total Clouds (% Sky Obscured)"], index=clearout_data["DateTime"]
            )
            ghi, dni, dhi = self.estimate_irradiance_from_cloud_cover(
                self.config.latitude, self.config.longitude, cloud_cover
            )

            # Add GHI, DNI, DHI to clearout data
            clearout_data["Global Horizontal Irradiance (W/m2)"] = ghi
            detail_names.append("Global Horizontal Irradiance (W/m2)")
            clearout_data["Direct Normal Irradiance (W/m2)"] = dni
            detail_names.append("Direct Normal Irradiance (W/m2)")
            clearout_data["Diffuse Horizontal Irradiance (W/m2)"] = dhi
            detail_names.append("Diffuse Horizontal Irradiance (W/m2)")

            # Add Preciptable Water (PWAT) with a PVLib method.
            clearout_data["Preciptable Water (cm)"] = self.estimate_preciptable_water(
                pd.Series(data=clearout_data["Temperature (°C)"]),
                pd.Series(data=clearout_data["Relative Humidity (%)"]),
            ).to_list()
            detail_names.append("Preciptable Water (cm)")

            # Add weather data
            # Add the records from clearout
            for row_index in range(0, len(clearout_data["DateTime"])):
                weather_record = WeatherDataRecord()
                for detail_name in detail_names:
                    key = clearoutside_key_mapping[detail_name][0]
                    if key is None:
                        continue
                    if detail_name in clearout_data:
                        value = clearout_data[detail_name][row_index]
                        corr_factor = clearoutside_key_mapping[detail_name][1]
                        if corr_factor:
                            value = value * corr_factor
                        setattr(weather_record, key, value)
                self.insert_by_datetime(weather_record)