Source code for program_files.preprocessing.data_preparation_algorithms.k_means_medoids

"""
    Christian Klemm - christian.klemm@fh-muenster.de
"""
import pandas
import numpy as np
import logging
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from program_files.preprocessing.data_preparation \
    import calculate_cluster_means, append_timeseries_to_weatherdata_sheet,\
    variable_costs_date_adaption, extract_single_periods, \
    timeseries_adaption


def calculate_k_means_clusters(cluster_number: int,
                               weather_data: pandas.DataFrame,
                               cluster_criterion: str,
                               period: str,
                               random_state: "int | None" = None
                               ) -> np.ndarray:
    """
        Applies the k-means algorithm to a list of period vectors
        (e.g. day-weather-vectors).

        Caution: weather data set must be available in hourly
        resolution!

        :param cluster_number: Number of k-means clusters
        :type cluster_number: int
        :param weather_data: weather_data, the clusters should be \
            applied to
        :type weather_data: pandas.DataFrame
        :param cluster_criterion: weather_parameter/column name which \
            should be applied as cluster criterion
        :type cluster_criterion: str
        :param period: defines whether days or weeks were selected
        :type period: str
        :param random_state: seed forwarded to \
            :class:`sklearn.cluster.KMeans`. The default ``None`` \
            keeps the unseeded behaviour; pass an int to obtain \
            reproducible cluster labels.
        :type random_state: int or None

        :return: - **labels** (np.ndarray) - Chronological list, \
            which period of the weather data set belongs to which \
            cluster
    """
    # one vector per period, built from the chosen criterion column
    cluster_vectors = extract_single_periods(data_set=weather_data,
                                             column_name=cluster_criterion,
                                             period=period)
    kmeans = KMeans(n_clusters=cluster_number, random_state=random_state)
    # fit() returns the fitted estimator itself
    return kmeans.fit(cluster_vectors).labels_
def calculate_k_medoids_clusters(cluster_number: int,
                                 weather_data: pandas.DataFrame,
                                 cluster_criterion: str,
                                 period: str,
                                 random_state: "int | None" = None
                                 ) -> np.ndarray:
    """
        Applies the k-medoids algorithm to a list of period vectors
        (e.g. day-weather-vectors).

        Caution: weather data set must be available in hourly
        resolution!

        :param cluster_number: Number of k-medoids clusters
        :type cluster_number: int
        :param weather_data: weather_data, the clusters should be \
            applied to
        :type weather_data: pandas.DataFrame
        :param cluster_criterion: weather_parameter/column name which \
            should be applied as cluster criterion
        :type cluster_criterion: str
        :param period: defines whether days or weeks were selected
        :type period: str
        :param random_state: seed forwarded to \
            :class:`sklearn_extra.cluster.KMedoids`. The default \
            ``None`` keeps the unseeded behaviour; pass an int to \
            obtain reproducible cluster labels.
        :type random_state: int or None

        :return: - **labels** (np.ndarray) - Chronological list, \
            which period of the weather data set belongs to which \
            cluster
    """
    # one vector per period, built from the chosen criterion column
    cluster_vectors = extract_single_periods(data_set=weather_data,
                                             column_name=cluster_criterion,
                                             period=period)
    kmedoids = KMedoids(n_clusters=cluster_number,
                        random_state=random_state)
    # fit() returns the fitted estimator itself
    return kmedoids.fit(cluster_vectors).labels_
def k_medoids_timeseries_adaption(nodes_data: dict, clusters: int,
                                  cluster_labels, period: str) -> None:
    """
        Replaces the 'timeseries' and 'weather data' entries of
        nodes_data by the clustered timeseries.

        The clustered data set is obtained by calling
        calculate_cluster_means on a copy of nodes_data['timeseries'].
        Its 'timestamp' column is then taken from the leading entries
        of the original timeseries' 'timestamp' column. For any period
        other than 'hours', 'days' or 'weeks' the 'timestamp' column
        is not assigned here.

        :param nodes_data: system parameters
        :type nodes_data: dict
        :param clusters: Number of clusters
        :type clusters: int
        :param cluster_labels: Chronological list, which period of \
            the data set belongs to which cluster
        :type cluster_labels: np.array
        :param period: defines whether hours, days or weeks were \
            selected
        :type period: str
    """
    original_timeseries = nodes_data['timeseries']
    clustered = calculate_cluster_means(
        data_set=original_timeseries.copy(),
        cluster_number=clusters,
        cluster_labels=cluster_labels,
        period=period)

    # length ratio between the timeseries and the weather data sheet;
    # evaluated for every period, exactly like the original code
    length_ratio = \
        len(original_timeseries) // len(nodes_data["weather data"])

    # number of leading timestamps which are carried over
    if period in ('hours', 'weeks'):
        stop = int(len(original_timeseries))
    elif period == 'days':
        stop = int(len(original_timeseries) / length_ratio)
    else:
        stop = None

    if stop is not None:
        clustered['timestamp'] = original_timeseries['timestamp'][:stop]

    # both sheets receive their own copy of the clustered data set
    nodes_data['timeseries'] = clustered.copy()
    nodes_data['weather data'] = clustered.copy()
def k_means_algorithm(cluster_period: str, days_per_cluster: int,
                      criterion: str, nodes_data: dict, period: str
                      ) -> None:
    """
        Identifies k-cluster periods based on the k-means algorithm
        and a given criterion.

        The timeseries and weather data sheets are merged, the cluster
        labels are calculated on the merged data set and the weather
        data sheet is replaced by its cluster means with a timestamp
        column taken from the beginning of the original weather data.
        Afterwards variable_costs_date_adaption and timeseries_adaption
        are called to adapt the remaining parameters of the energy
        system (variable costs and timeseries).

        :param cluster_period: contains the gui input of the chosen \
            period type (possible entries: days, weeks)
        :type cluster_period: str
        :param days_per_cluster: contains the gui input of the chosen \
            index (possible entries: 1 - 365)
        :type days_per_cluster: int
        :param criterion: criterion chosen for k_mean algorithm
        :type criterion: str
        :param nodes_data: dictionary containing the excel worksheets \
            from the used model definition workbook
        :type nodes_data: dict
        :param period: defines whether days or weeks were selected
        :type period: str

        :raise: - **ValueError** - Error raised if the chosen period \
            is not supported, if days_per_cluster is smaller than 1 \
            or if it is so large that no cluster remains
    """
    if cluster_period == 'days':
        periods_per_year = 365
    elif cluster_period == 'weeks':
        periods_per_year = 52
    else:
        raise ValueError("period chosen not possible")

    # validate before nodes_data is touched, so that invalid input
    # neither ends in a bare ZeroDivisionError nor leaves nodes_data
    # half-modified
    period_length = int(days_per_cluster)
    if period_length < 1:
        raise ValueError("days_per_cluster must be at least 1, got "
                         + str(days_per_cluster))
    clusters = periods_per_year // period_length
    if clusters < 1:
        raise ValueError("days_per_cluster (" + str(days_per_cluster)
                         + ") is too large for the chosen period, "
                         "no cluster would remain")

    # Merge the timeseries and weather data sets, so that all
    # timeseries get clustered within one step
    nodes_data['timeseries'] = append_timeseries_to_weatherdata_sheet(
        nodes_data)
    merged_data = nodes_data['timeseries'].copy()

    # Calculate k-mean clusters, based on the cluster_criterion
    cluster_labels = calculate_k_means_clusters(
        cluster_number=clusters,
        weather_data=merged_data,
        cluster_criterion=criterion,
        period=period)

    # Apply the clusters to the entire weather data set
    weather_data = nodes_data['weather data'].copy()
    prep_weather_data = calculate_cluster_means(
        data_set=weather_data,
        cluster_number=clusters,
        cluster_labels=cluster_labels,
        period=period)

    # the shortened data set gets the leading timestamps of the
    # original weather data
    prep_weather_data['timestamp'] = \
        weather_data['timestamp'][:len(prep_weather_data)]

    # Replaces the weather data set in nodes_data by the new one
    nodes_data['weather data'] = prep_weather_data

    # Adapts other parameters (besides weather data) of the energy
    # system
    variable_costs_date_adaption(nodes_data, clusters, period)
    timeseries_adaption(nodes_data, clusters, cluster_labels, period)
def k_medoids_algorithm(cluster_period: str, days_per_cluster: int,
                        criterion: str, nodes_data: dict, period: str
                        ) -> None:
    """
        Identifies k-cluster periods based on the k-medoids algorithm
        and a given criterion.

        The timeseries and weather data sheets are merged, the cluster
        labels are calculated on the merged data set and the merged
        data set is stored as both the 'timeseries' and the
        'weather data' sheet. Afterwards variable_costs_date_adaption
        and k_medoids_timeseries_adaption are called to adapt the
        variable costs and to shorten the timeseries.

        :param cluster_period: contains the gui input of the chosen \
            period type (possible entries: days, weeks)
        :type cluster_period: str
        :param days_per_cluster: contains the gui input of the chosen \
            index (possible entries: 1 - 365)
        :type days_per_cluster: int
        :param criterion: criterion chosen for k_medoids algorithm
        :type criterion: str
        :param nodes_data: dictionary containing the excel worksheets \
            from the used model definition workbook
        :type nodes_data: dict
        :param period: defines whether days or weeks were selected
        :type period: str

        :raise: - **ValueError** - Error raised if the chosen period \
            is not supported, if days_per_cluster is smaller than 1 \
            or if it is so large that no cluster remains
    """
    if cluster_period == 'days':
        periods_per_year = 365
    elif cluster_period == 'weeks':
        periods_per_year = 52
    else:
        raise ValueError("period chosen not possible")

    # validate before nodes_data is touched, so that invalid input
    # neither ends in a bare ZeroDivisionError nor leaves nodes_data
    # half-modified
    period_length = int(days_per_cluster)
    if period_length < 1:
        raise ValueError("days_per_cluster must be at least 1, got "
                         + str(days_per_cluster))
    clusters = periods_per_year // period_length
    if clusters < 1:
        raise ValueError("days_per_cluster (" + str(days_per_cluster)
                         + ") is too large for the chosen period, "
                         "no cluster would remain")

    logging.info('days per cluster: %s', days_per_cluster)
    logging.info('clusters: %s', clusters)

    # Merge the timeseries and weather data sets, so that all
    # timeseries get clustered within one step
    nodes_data['timeseries'] = append_timeseries_to_weatherdata_sheet(
        nodes_data=nodes_data)

    # Calculate k-medoids clusters, based on the cluster_criterion
    cluster_labels = calculate_k_medoids_clusters(
        cluster_number=clusters,
        weather_data=nodes_data['timeseries'].copy(),
        cluster_criterion=criterion,
        period=period)

    # a fresh copy of the merged data set becomes both sheets; as in
    # the original implementation both keys reference the same frame
    merged_data = nodes_data['timeseries'].copy()
    nodes_data['weather data'] = merged_data
    nodes_data['timeseries'] = merged_data

    # Adapts other parameters (besides weather data) of the energy
    # system
    variable_costs_date_adaption(nodes_data, clusters, period)
    k_medoids_timeseries_adaption(nodes_data, clusters, cluster_labels,
                                  period)