Source code for program_files.preprocessing.data_preparation_algorithms.heuristic_selection

"""
    Christian Klemm - christian.klemm@fh-muenster.de
"""
import datetime
import pandas
import os

from program_files.preprocessing.data_preparation \
    import variable_costs_date_adaption, append_timeseries_to_weatherdata_sheet


def hierarchical_selection(nodes_data: dict, scheme: str, period: str,
                           seasons: int) -> None:
    """
        Algorithm for the hierarchical selection of representative
        time periods of a weather data set.

        The weather data is reordered so that it starts in December,
        cut into periods (days or weeks), and the periods are grouped
        into seasons. The representative periods are then chosen
        according to the rows of the sheet named ``scheme`` in
        ``technical_data/hierarchical_selection_schemes.xlsx``. Every
        row is read as

        - column 0: season ('winter', 'spring', 'summer', 'fall' for
          4 seasons, a 1-based season number for 12 seasons, or
          'year' for the entire data set)
        - column 1: 'lowest', 'highest' or 'average'
        - column 2: name of the weather data column used as criterion
        - column 3: 'extreme' or 'average' (only read for 'lowest'
          and 'highest')

        A scheme may e.g. select for every season the week containing
        the coldest temperature, the week with the lowest average
        sun duration, the week containing the warmest temperature
        and the week with the highest average sun duration.

        ``nodes_data`` is modified in place: the entries
        'energysystem', 'timeseries' and 'weather data' are
        overwritten, and ``variable_costs_date_adaption`` is applied
        afterwards.

        :param nodes_data: SESMG-nodes data, containing weather data, \
            energy system parameters and timeseries
        :type nodes_data: dict
        :param scheme: ID of heuristic selection scheme to be applied
        :type scheme: str
        :param period: specifies whether 'days' or 'weeks' are applied \
            as heuristic selection reference periods
        :type period: str
        :param seasons: number of seasons for hierarchical selections, \
            e.g. 12 for months or 4 for annual seasons
        :type seasons: int

        :raises ValueError: if ``period`` is neither 'weeks' nor \
            'days', if a scheme row holds an unsupported entry, or \
            if no period can be selected for a scheme row

        :return: None - the modified data is written back into \
            **nodes_data**
    """

    def get_period_length(period: str) -> int:
        """
            Returns the number of timesteps of one reference period.

            :param period: 'weeks' or 'days'
            :type period: str

            :raises ValueError: if the period is not supported

            :return: - **-** (int) - 168 for 'weeks', 24 for 'days'
        """
        # 24 timesteps per day, i.e. hourly time steps are presumed
        if period == 'weeks':
            return 24 * 7
        if period == 'days':
            return 24
        raise ValueError("Non supported period")

    def extract_data_slices(data_set: pandas.DataFrame, timesteps: int
                            ) -> list:
        """
            Extracts consecutive slices of a defined length of a
            dataset. E.g. slices of weeks (168 timesteps) of a
            weather data set may be extracted. Rows left over after
            the last complete slice are dropped.

            :param data_set: Data set from which the slices are \
                extracted
            :type data_set: pandas.DataFrame
            :param timesteps: length of the slices to be extracted
            :type timesteps: int

            :return: - **list_of_data_slices** (list) - extracted \
                slices of the given data set
        """
        return [data_set[i * timesteps:(i + 1) * timesteps]
                for i in range(len(data_set) // timesteps)]

    def identify_timeseries_minimum(data_set: pandas.DataFrame,
                                    column_name: str) -> float:
        """
            Returns the minimum value of a certain column of a given
            data_set.

            :param data_set: Data set under investigation
            :type data_set: pandas.DataFrame
            :param column_name: column under investigation
            :type column_name: str

            :return: - **-** (float) - minimum value of a column
        """
        return min(data_set[column_name])

    def identify_timeseries_maximum(data_set: pandas.DataFrame,
                                    column_name: str) -> float:
        """
            Returns the maximum value of a certain column of a given
            data_set.

            :param data_set: Data set under investigation
            :type data_set: pandas.DataFrame
            :param column_name: column under investigation
            :type column_name: str

            :return: - **-** (float) - maximum value of a column
        """
        return max(data_set[column_name])

    def identify_timeseries_average(data_set: pandas.DataFrame,
                                    column_name: str) -> float:
        """
            Returns the average value of a certain column of a given
            data_set. An empty column yields 0.

            :param data_set: Data set under investigation
            :type data_set: pandas.DataFrame
            :param column_name: column under investigation
            :type column_name: str

            :return: - **-** (float) - average of a column
        """
        column = data_set[column_name]
        return 0 if len(column) == 0 else sum(column) / len(column)

    def select_extreme_period(data_set: list, criterion: str, value: str,
                              find_minimum: bool) -> pandas.DataFrame:
        """
            Shared implementation of the minimum / maximum period
            selection. On ties the first period wins.

            :param data_set: list of period data slices
            :type data_set: list
            :param criterion: column, which is the criterion of the \
                selection
            :type criterion: str
            :param value: 'extreme' to rate a period by its absolute \
                minimum / maximum, 'average' to rate it by its average
            :type value: str
            :param find_minimum: True selects the lowest rated \
                period, False the highest rated one
            :type find_minimum: bool

            :raises ValueError: if value is not supported or no \
                period can be selected

            :return: - **-** (pandas.DataFrame) - selected period
        """
        if value == "extreme":
            rate = identify_timeseries_minimum if find_minimum \
                else identify_timeseries_maximum
        elif value == "average":
            rate = identify_timeseries_average
        else:
            raise ValueError("value chosen not supported")

        # infinite start values instead of magic numbers, so that every
        # finite rating is able to replace them
        best_rating = float("inf") if find_minimum else float("-inf")
        best_period = None
        for period_data in data_set:
            rating = rate(data_set=period_data, column_name=criterion)
            if find_minimum:
                is_better = rating < best_rating
            else:
                is_better = rating > best_rating
            if is_better:
                best_rating = rating
                best_period = period_data

        if best_period is None:
            raise ValueError("no period could be selected for criterion "
                             + str(criterion))
        return best_period

    def identify_minimum_week(data_set: list, criterion: str,
                              value: str) -> pandas.DataFrame:
        """
            Returns the week with a minimum value of a certain column.
            Either the week with the absolute minimum value, or the
            week with the average minimum value can be selected.

            :param data_set: list of period data slices
            :type data_set: list
            :param criterion: column, which is the criterion of the \
                selection
            :type criterion: str
            :param value: 'extreme' for absolute minimum value, \
                'average' for average minimum value selection
            :type value: str

            :return: - **minimum_week** (pandas.Dataframe) - Dataset \
                of the selected minimum week
        """
        return select_extreme_period(data_set=data_set,
                                     criterion=criterion,
                                     value=value,
                                     find_minimum=True)

    def identify_maximum_week(data_set: list, criterion: str,
                              value: str) -> pandas.DataFrame:
        """
            Returns the week with a maximum value of a certain column.
            Either the week with the absolute maximum value, or the
            week with the average maximum value can be selected.

            :param data_set: list of period data slices
            :type data_set: list
            :param criterion: column, which is the criterion of the \
                selection
            :type criterion: str
            :param value: 'extreme' for absolute maximum value, \
                'average' for average maximum value selection
            :type value: str

            :return: - **maximum_week** (pandas.Dataframe) - Dataset \
                of the selected maximum week
        """
        return select_extreme_period(data_set=data_set,
                                     criterion=criterion,
                                     value=value,
                                     find_minimum=False)

    def identify_average_week(data_set: list,
                              criterion: str) -> pandas.DataFrame:
        """
            Returns the week whose average of a certain column is
            closest to the mean of all weekly averages. On ties the
            last week wins.

            :param data_set: list of period data slices
            :type data_set: list
            :param criterion: column, which is the criterion of the \
                selection
            :type criterion: str

            :raises ValueError: if no week can be selected

            :return: - **average_data** (pandas.Dataframe) - Dataset \
                of the selected average week
        """
        # average value of every week
        list_of_averages = [
            identify_timeseries_average(data_set=period_data,
                                        column_name=criterion)
            for period_data in data_set]

        # average of the entire dataset
        absolute_average = 0 if len(list_of_averages) == 0 \
            else sum(list_of_averages) / len(list_of_averages)

        # check which weekly average is closest to the absolute average
        smallest_deviation = float("inf")
        average_data = None
        for period_data, weekly_average in zip(data_set, list_of_averages):
            deviation = abs(weekly_average - absolute_average)
            # "<=" keeps the last of several equally close weeks
            if deviation <= smallest_deviation:
                smallest_deviation = deviation
                average_data = period_data

        if average_data is None:
            raise ValueError("no period could be selected for criterion "
                             + str(criterion))
        return average_data

    def reorder_weather_data() -> None:
        """
            Reorder weather data set due to the meteorological
            beginning of winter on the 01.12.

            The last 30 days of the time series are moved to its
            front and the start and end date of the energy system are
            shifted accordingly. Writes nodes_data['energysystem'],
            nodes_data['timeseries'] and nodes_data['weather data'].
        """
        energysystem = nodes_data['energysystem']
        old_start_date = energysystem['start date'][1]
        old_end_date = energysystem['end date'][1]
        previous_year = int(old_start_date.year) - 1

        # NOTE(review): an end date on the 30th moves the new start to
        # the 02.12. instead of the 01.12. - presumably to keep the
        # number of time steps unchanged; confirm.
        start_day = 2 if int(old_end_date.day) == 30 else 1
        energysystem['start date'] = datetime.datetime(
            previous_year, 12, start_day, 0, 0, 0)
        energysystem['end date'] = datetime.datetime(
            int(old_end_date.year), 11, 30, 23, 0, 0)

        nodes_data["timeseries"] = append_timeseries_to_weatherdata_sheet(
            nodes_data)
        old_timeseries = nodes_data["timeseries"].copy()

        # move the last 30 days to the front; pandas.concat replaces
        # DataFrame.append (removed in pandas 2.0) and likewise keeps
        # the original index labels
        nodes_data["timeseries"] = pandas.concat(
            [old_timeseries[-30 * 24:], old_timeseries[:-30 * 24]])

        # date the moved rows back to the previous year
        # NOTE(review): the label range is hard-coded for 8760 rows
        # (8040 = 8760 - 30 * 24) and leaves out label 8759 - possibly
        # an off-by-one; kept unchanged, confirm the intent.
        for i in range(8040, 8759):
            nodes_data['timeseries'].loc[i, 'timestamp'] = \
                nodes_data['timeseries']['timestamp'][i].replace(
                    year=int(old_start_date.year - 1))

        # drop=False keeps the old labels as an additional column
        nodes_data["timeseries"].reset_index(inplace=True, drop=False)
        nodes_data["weather data"] = nodes_data["timeseries"].copy()

    def create_period_weather_data(period: str) -> list:
        """
            Splits the weather data set in nodes_data into weekly or
            daily weather data sets.

            :param period: defines whether daily or weekly weather \
                data sets are created
            :type period: str

            :return: **-** (list) - list of the weekly or daily \
                weather data sets
        """
        return extract_data_slices(data_set=nodes_data['weather data'],
                                   timesteps=get_period_length(period))

    def create_period_season_weather_data(period_data_slices: list,
                                          seasons: int) -> list:
        """
            Sorts the given period data slices into seasons of equal
            length (e.g. 13 weeks per season for 52 weeks and 4
            seasons). Slices left over after the last complete
            season are not assigned to any season.

            :param period_data_slices: weather data already \
                shortened to daily or weekly resolution
            :type period_data_slices: list
            :param seasons: number of seasons
            :type seasons: int

            :return: - **season_data** (list) - list, containing \
                the list of period data slices of every season.
        """
        season_length = len(period_data_slices) // seasons
        return [period_data_slices[season_length * i:season_length * (i + 1)]
                for i in range(seasons)]

    def select_heuristic_periods(heuristic_periods: list,
                                 period_data_slices: list,
                                 season_data: list,
                                 seasons: int) -> pandas.DataFrame:
        """
            Selects and returns representative values of time series
            according to a given heuristic scheme.

            :param heuristic_periods: defined heuristic periods to be \
                selected
            :type heuristic_periods: list
            :param period_data_slices: list containing the periods \
                data slices
            :type period_data_slices: list
            :param season_data: containing list of weekly weather data \
                slices of every season.
            :type season_data: list
            :param seasons: number of seasons for hierarchical \
                selections, e.g. 12 for months or 4 for annual seasons
            :type seasons: int

            :raises ValueError: if a scheme row holds an unsupported \
                season or selection mode

            :return: - **prep_weather_data** (pandas.DataFrame) - \
                dataframe containing the sampled weather data
        """
        # NOTE(review): only 4 and 12 seasons are implemented; every
        # other value yields an empty data frame (behaviour kept as
        # found) - consider raising an error instead.
        if seasons not in (4, 12):
            return pandas.DataFrame()

        named_seasons = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
        selected_periods = []
        for representative in heuristic_periods:
            season_id = representative[0]
            if season_id == 'year':
                data_set = period_data_slices
            elif seasons == 4:
                if season_id not in named_seasons:
                    raise ValueError("Error")
                data_set = season_data[named_seasons[season_id]]
            else:
                # seasons are numbered starting with 1 in the scheme
                data_set = season_data[int(season_id) - 1]

            mode = representative[1]
            if mode == 'lowest':
                selected_week = identify_minimum_week(
                    data_set=data_set,
                    criterion=representative[2],
                    value=representative[3])
            elif mode == 'highest':
                selected_week = identify_maximum_week(
                    data_set=data_set,
                    criterion=representative[2],
                    value=representative[3])
            elif mode == 'average':
                selected_week = identify_average_week(
                    data_set=data_set,
                    criterion=representative[2])
            else:
                raise ValueError("Error")
            selected_periods.append(selected_week)

        # pandas.concat raises on an empty list, an empty scheme
        # therefore has to be handled separately
        if not selected_periods:
            return pandas.DataFrame()
        return pandas.concat(selected_periods)

    # validate the period before nodes_data is modified
    period_length = get_period_length(period)

    # get scheme path for heuristic selection from technical data folder
    scheme_path = \
        os.path.join(os.path.dirname(os.path.dirname(__file__)), '..',
                     'technical_data/hierarchical_selection_schemes.xlsx')

    reorder_weather_data()
    period_data_slices = create_period_weather_data(period=period)
    season_data = create_period_season_weather_data(
        period_data_slices=period_data_slices, seasons=seasons)

    scheme_df = pandas.read_excel(scheme_path, sheet_name=str(scheme))
    heuristic_periods = scheme_df.values.tolist()

    prep_weather_data = select_heuristic_periods(
        heuristic_periods=heuristic_periods,
        period_data_slices=period_data_slices,
        season_data=season_data,
        seasons=seasons)

    # take over the time series columns (aligned on the index labels)
    for col in nodes_data['timeseries']:
        prep_weather_data[col] = nodes_data['timeseries'][col]

    # the selected periods are renumbered and dated with the first
    # timestamps of the reordered data set
    prep_weather_data.reset_index(drop=True, inplace=True)
    prep_weather_data['timestamp'] = \
        nodes_data['weather data']['timestamp'][:len(prep_weather_data)]

    # Replace original data with hierarchical clustered data
    nodes_data['weather data'] = prep_weather_data.copy()
    nodes_data['timeseries'] = prep_weather_data.copy()

    # Adapts Other Parameters (despite weather data) of the energy system
    variable_costs_date_adaption(
        nodes_data=nodes_data,
        clusters=len(nodes_data['weather data']) // period_length,
        period=period)