Source code for program_files.preprocessing.data_preparation_algorithms.k_means_medoids

"""
    Christian Klemm - christian.klemm@fh-muenster.de
"""
import pandas
import numpy as np
import logging
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from program_files.preprocessing.data_preparation \
    import calculate_cluster_means, append_timeseries_to_weatherdata_sheet,\
    variable_costs_date_adaption, extract_single_periods, \
    timeseries_adaption


[docs]def calculate_k_means_clusters(cluster_number: int,
                               weather_data: pandas.DataFrame,
                               cluster_criterion: str, period: str
                               ) -> np.array:
    """
        Applies the k-means algorithm to a list of day-weather-vectors.
        Caution: weather data set must be available in hourly resolution!

        :param cluster_number: Number of k-mean-clusters
        :type cluster_number: int
        :param weather_data: weather_data, the clusters should be applied to
        :type weather_data: pandas.DataFrame
        :param cluster_criterion: weather_parameter/column name which
            should be applied as cluster criterion
        :type cluster_criterion: str
        :param period: defines rather days or weeks were selected
        :type period: str

        :return: - **model.labels_** (np.array) - Chronological list, \
            which days of the weather data set belongs to which cluster

    """
    cluster_vectors = extract_single_periods(data_set=weather_data,
                                             column_name=cluster_criterion,
                                             period=period)
    kmeans = KMeans(n_clusters=cluster_number)
    model = kmeans.fit(cluster_vectors)
    return model.labels_


[docs]def calculate_k_medoids_clusters(cluster_number: int,
                                 weather_data: pandas.DataFrame,
                                 cluster_criterion: str, period: str
                                 ) -> np.array:
    """
        Applies the k-medoids algorithm to a list of
        day-weather-vectors. Caution: weather data set must be
        available in hourly resolution!

        :param cluster_number: Number of k-mean-clusters
        :type cluster_number: int
        :param weather_data: weather_data, the clusters should be \
            applied to
        :type weather_data: pandas.DataFrame
        :param cluster_criterion: weather_parameter/column name which \
            should be applied as cluster criterion
        :type cluster_criterion: str
        :param period: defines rather days or weeks were selected
        :type period: str

        :return: - **model.labels_** (np.array) - Chronological list, \
            which days of the weather data set belongs to which cluster

    """
    cluster_vectors = extract_single_periods(data_set=weather_data,
                                             column_name=cluster_criterion,
                                             period=period)
    kmedoids = KMedoids(n_clusters=cluster_number)
    model = kmedoids.fit(cluster_vectors)
    return model.labels_


[docs]def k_medoids_timeseries_adaption(nodes_data: dict, clusters: int,
                                  cluster_labels, period: str) -> None:
    """
        Identifies k-cluster periods from the original timeseries and merges
        them to a new set of timeseries. A new consecutive time index, starting
        with the same date as the original dataset is assigned to the merged
        timeseries.

        :param nodes_data: system parameters
        :type nodes_data: dict
        :param clusters: Number of clusters
        :type clusters: int
        :param cluster_labels: Chronological list, which days of the weather
                               data set belongs to which cluster

        :type cluster_labels: np.array
        :param period: defines rather hours, days or weeks were selected
        :type period: str

    """
    prep_timeseries = \
        calculate_cluster_means(data_set=nodes_data['timeseries'].copy(),
                                cluster_number=clusters,
                                cluster_labels=cluster_labels,
                                period=period)
    clusters = len(nodes_data["timeseries"]) // len(nodes_data["weather data"])
    # Rename columns of the new timeseries-dataset
    if period == 'hours':
        prep_timeseries['timestamp'] = \
            nodes_data['timeseries']['timestamp'][
            :int(len(nodes_data['timeseries']))]
    elif period == 'days':
        prep_timeseries['timestamp'] = \
            nodes_data['timeseries']['timestamp'][
            :int(len(nodes_data['timeseries']) / clusters)]
    elif period == 'weeks':
        prep_timeseries['timestamp'] = \
            nodes_data['timeseries']['timestamp'][
            :int(len(nodes_data['timeseries']))]  # / (clusters * 7))]
    nodes_data['timeseries'] = prep_timeseries.copy()
    nodes_data['weather data'] = prep_timeseries.copy()


[docs]def k_means_algorithm(cluster_period: int, days_per_cluster: int,
                      criterion: str, nodes_data: dict, period: str) -> None:
    """
        identifies k-cluster periods based on the k-means algorithm
        based on a given criteria. Based on the selected periods, for
        all timeseries weather data, the respective periods are
        identified and merged to new shortened timeseries with
        consecutive time-indices which start with the same start date
        as the original timeseries. Afterwards, all variable costs are
        multiplied by the shortening factor (variable cost factor) of
        the time-series to ensure the same ratio between variable and
        periodical costs for the energy system optimization model in
        which the time-series will be applied.
        
        :param cluster_period: contains the gui input of the chosen \
            period type (possible entries: days, weeks)
        :type cluster_period: str
        :param days_per_cluster: contains the gui input of the chosen \
            index (possible entries: 1 - 365)
        :type days_per_cluster: int
        :param criterion: criterion chosen for k_mean algorithm
        :type criterion: str
        :param nodes_data: dictionary containing the excel worksheets \
            from the used model definition workbook
        :type nodes_data: dict
        :param period: defines rather days or weeks were selected
        :type period: str
        
        :raise: - **ValueError** - Error raised if the chosen period \
            is not supported
    """
    if cluster_period == 'days':
        clusters = 365 // int(days_per_cluster)
    elif cluster_period == 'weeks':
        clusters = 52 // int(days_per_cluster)
    else:
        raise ValueError("period chosen not possible")
    # Merge the timeseries and weather data sets, so that that all timeseries'
    # get clustered within one step
    nodes_data['timeseries'] = append_timeseries_to_weatherdata_sheet(
        nodes_data)
    weather_data = nodes_data['timeseries'].copy()
    
    # # depending on the chosen criterion rather the timeseries or the
    # # weather data sheet is selected for the following preparation
    # if criterion == 'el_demand_sum' or criterion == 'heat_demand_sum':
    #     weather_data = nodes_data['timeseries'].copy()
    # else:
    #     weather_data = nodes_data['weather data'].copy()
    
    # Calculate k-mean clusters, based on the cluster_criterion
    cluster_labels = calculate_k_means_clusters(cluster_number=clusters,
                                                weather_data=weather_data,
                                                cluster_criterion=criterion,
                                                period=period)
    
    weather_data = nodes_data['weather data'].copy()
    
    # Apply the Clusters to the entire weather_dataset
    prep_weather_data = calculate_cluster_means(data_set=weather_data,
                                                cluster_number=clusters,
                                                cluster_labels=cluster_labels,
                                                period=period)
    
    # Rename columns of the new weather_dataset
    prep_weather_data['timestamp'] = \
        weather_data['timestamp'][:len(prep_weather_data)]
    # Replaces the weather data set in nodes_data by the new one
    nodes_data['weather data'] = prep_weather_data
    
    # Adapts Other Parameters (despite weather data) of the energy system
    variable_costs_date_adaption(nodes_data, clusters, period)
    timeseries_adaption(nodes_data, clusters, cluster_labels, period)
    
    
[docs]def k_medoids_algorithm(cluster_period: int, days_per_cluster: int,
                        criterion: str, nodes_data: dict, period: str) -> None:
    """
        identifies k-cluster periods based on the k-medoids algorithm
        based on a given criteria. Based on the selected periods, for
        all timeseries weather data, the respective periods are
        identified and merged to new shortened timeseries with
        consecutive time-indices which start with the same start date
        as the original timeseries. Afterwards, all variable costs are
        multiplied by the shortening factor (variable cost factor) of
        the time-series to ensure the same ratio between variable and
        periodical costs for the energy system optimization model in
        which the time-series will be applied.
        
        :param cluster_period: contains the gui input of the chosen \
            period type (possible entries: hours, days, weeks)
        :type cluster_period: str
        :param days_per_cluster: contains the gui input of the chosen \
            index (possible entries: 1 - 365)
        :type days_per_cluster: int
        :param criterion: criterion chosen for k_mean algorithm
        :type criterion: str
        :param nodes_data: dictionary containing the excel worksheets \
            from the used model definition workbook
        :type nodes_data: dict
        :param period: defines rather days or weeks were selected
        :type period: str
        
        :raise: - **ValueError** - Error raised if the chosen period \
            is not supported
    """
    if cluster_period == 'days':
        clusters = 365 // int(days_per_cluster)
        logging.info('days per cluster: ' + str(days_per_cluster))
        logging.info('clusters: ' + str(clusters))
    elif cluster_period == 'weeks':
        clusters = 52 // int(days_per_cluster)
        logging.info('days per cluster: ' + str(days_per_cluster))
        logging.info('clusters: ' + str(clusters))
    else:
        raise ValueError("period chosen not possible")
    # Merge the timeseries and weather data sets, sothat that all timeseries'
    # get clustered within one step
    nodes_data['timeseries'] = append_timeseries_to_weatherdata_sheet(
        nodes_data=nodes_data)

    weather_data = nodes_data['timeseries'].copy()

    # Calculate k-medoids clusters, based on the cluster_criterion
    cluster_labels = calculate_k_medoids_clusters(cluster_number=clusters,
                                                  weather_data=weather_data,
                                                  cluster_criterion=criterion,
                                                  period=period)

    weather_data = nodes_data['timeseries'].copy()
    nodes_data['weather data'] = weather_data
    nodes_data['timeseries'] = weather_data

    # Adapts Other Parameters (despite weather data) of the energy system
    variable_costs_date_adaption(nodes_data, clusters, period)

    k_medoids_timeseries_adaption(nodes_data, clusters,
                                  cluster_labels, period)