From 4d6b401a376b6c8a9e1d2f523325a110f5af8abe Mon Sep 17 00:00:00 2001
From: Frank Boerman
Date: Sun, 6 Oct 2024 23:50:55 +0200
Subject: [PATCH] fixes generic parser to correctly handle various scenarios
 that can occur

Discussion and inspiration for the fixes came from #347 and #346.
---
 entsoe/entsoe.py         |   6 +-
 entsoe/parsers.py        |  19 +++-
 entsoe/series_parsers.py | 149 ++++++++++++++++++++++-----------------
 3 files changed, 101 insertions(+), 73 deletions(-)

diff --git a/entsoe/entsoe.py b/entsoe/entsoe.py
index 8f1078f..4425c5f 100644
--- a/entsoe/entsoe.py
+++ b/entsoe/entsoe.py
@@ -23,7 +23,7 @@
 warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning)
 
 __title__ = "entsoe-py"
-__version__ = "0.6.8"
+__version__ = "0.6.9"
 __author__ = "EnergieID.be, Frank Boerman"
 __license__ = "MIT"
 
@@ -1152,7 +1152,7 @@ def query_withdrawn_unavailability_of_generation_units(
 class EntsoePandasClient(EntsoeRawClient):
     @year_limited
     def query_net_position(self, country_code: Union[Area, str],
-                           start: pd.Timestamp, end: pd.Timestamp, dayahead: bool = True) -> pd.Series:
+                           start: pd.Timestamp, end: pd.Timestamp, dayahead: bool = True, resolution: Literal['60min', '30min', '15min'] = '60min') -> pd.Series:
         """
 
         Parameters
@@ -1168,7 +1168,7 @@ def query_net_position(self, country_code: Union[Area, str],
         area = lookup_area(country_code)
         text = super(EntsoePandasClient, self).query_net_position(
             country_code=area, start=start, end=end, dayahead=dayahead)
-        series = parse_netpositions(text)
+        series = parse_netpositions(text, resolution=resolution)
         series = series.tz_convert(area.tz)
         series = series.truncate(before=start, after=end)
         return series
diff --git a/entsoe/parsers.py b/entsoe/parsers.py
index eb07205..4a4d556 100644
--- a/entsoe/parsers.py
+++ b/entsoe/parsers.py
@@ -17,7 +17,6 @@
 CONSUMPTION_ELEMENT = "outBiddingZone_Domain.mRID"
 
 
-
 def parse_prices(xml_text):
     """
     Parameters
     ----------
     xml_text : str
 
     Returns
     -------
     pd.Series
     """
@@ -35,15 +34,18 @@ def parse_prices(xml_text):
     }
     for soup in _extract_timeseries(xml_text):
         soup_series = _parse_timeseries_generic(soup, 'price.amount')
-        series[soup_series.index.freqstr].append(soup_series)
+        for key in series.keys():
+            series[key].append(soup_series[key])
 
     for freq, freq_series in series.items():
-        if len(freq_series) > 0:
+        try:
             series[freq] = pd.concat(freq_series).sort_index()
+        except ValueError:
+            series[freq] = pd.Series()
 
     return series
 
 
-def parse_netpositions(xml_text):
+def parse_netpositions(xml_text, resolution):
     """
 
     Parameters
     ----------
     xml_text : str
 
     Returns
     -------
     pd.Series
     """
@@ -56,7 +58,7 @@ def parse_netpositions(xml_text):
     series_all = []
     for soup in _extract_timeseries(xml_text):
-        series = _parse_timeseries_generic(soup)
+        series = _parse_timeseries_generic(soup)[resolution]
         if 'REGION' in soup.find('out_domain.mrid').text:
             factor = -1  # flow is import so negative
         else:
             factor = 1
@@ -692,7 +694,7 @@ def _parse_load_timeseries(soup):
     -------
     pd.Series
     """
-    return _parse_timeseries_generic(soup)
+    return _parse_timeseries_generic(soup, merge_series=True)
 
 def _parse_generation_timeseries(soup, per_plant: bool = False, include_eic: bool = False) -> pd.Series:
     """
@@ -707,7 +709,10 @@ def _parse_generation_timeseries(soup, per_plant: bool = False, include_eic: boo
     -------
     pd.Series
     """
-    series = _parse_timeseries_generic(soup)
+    # should never have duplicated timestamps with differing time resolutions,
+    # so simply concat all possibilities
+    series = _parse_timeseries_generic(soup, merge_series=True)
+    # Check if there is a psrtype, if so, get it.
     _psrtype = soup.find('psrtype')
diff --git a/entsoe/series_parsers.py b/entsoe/series_parsers.py
index fd00391..2fc7c0f 100644
--- a/entsoe/series_parsers.py
+++ b/entsoe/series_parsers.py
@@ -18,6 +18,7 @@ def _extract_timeseries(xml_text):
     for timeseries in soup.find_all('timeseries'):
         yield timeseries
 
+
 def _resolution_to_timedelta(res_text: str) -> str:
     """
     Convert an Entsoe resolution to something that pandas can understand
@@ -40,73 +41,95 @@ def _resolution_to_timedelta(res_text: str) -> str:
                 "issue.".format(res_text))
     return delta
 
+
 def _parse_datetimeindex(soup, tz=None):
-    """
-    Create a datetimeindex from a parsed beautifulsoup,
-    given that it contains the elements 'start', 'end'
-    and 'resolution'
-
-    Parameters
-    ----------
-    soup : bs4.element.tag
-    tz: str
-
-    Returns
-    -------
-    pd.DatetimeIndex
-    """
-    start = pd.Timestamp(soup.find('start').text)
-    end = pd.Timestamp(soup.find_all('end')[-1].text)
-    if tz is not None:
-        start = start.tz_convert(tz)
-        end = end.tz_convert(tz)
-
-    delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
-    index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
-    if tz is not None:
-        dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
-        if dst_jump and delta == "7D":
-            # For a weekly granularity, if we jump over the DST date in October,
-            # date_range erronously returns an additional index element
-            # because that week contains 169 hours instead of 168.
-            index = index[:-1]
-        index = index.tz_convert("UTC")
-    elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
-        # For a daily or larger granularity, if we jump over the DST date in October,
-        # date_range erronously returns an additional index element
-        # because the period contains one extra hour.
-        index = index[:-1]
-
-    return index
-
-def _parse_timeseries_generic(soup, label='quantity', to_float=True):
-    data = {}
-    for point in soup.find_all('point'):
-        value = point.find(label).text
-        if to_float:
-            value = value.replace(',', '')
-        data[int(point.find('position').text)] = value
-
-    series = pd.Series(data)
-    series.sort_index()
-    index = _parse_datetimeindex(soup)
-    if soup.find('curvetype').text == 'A03':
-        # with A03 its possible that positions are missing, this is when values are repeated
-        # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
-        # so lets do reindex on a continious range which creates gaps if positions are missing
-        # then forward fill, so repeat last valid value, to fill the gaps
-        series = series.reindex(list(range(1, len(index)+1))).ffill()
-
-    series.index = index
-    if to_float:
-        series = series.astype(float)
-
-    return series
+    """
+    Create a datetimeindex from a parsed beautifulsoup,
+    given that it contains the elements 'start', 'end'
+    and 'resolution'
+
+    Parameters
+    ----------
+    soup : bs4.element.tag
+    tz: str
+
+    Returns
+    -------
+    pd.DatetimeIndex
+    """
+    start = pd.Timestamp(soup.find('start').text)
+    end = pd.Timestamp(soup.find_all('end')[-1].text)
+    if tz is not None:
+        start = start.tz_convert(tz)
+        end = end.tz_convert(tz)
+
+    delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
+    index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
+    if tz is not None:
+        dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
+        if dst_jump and delta == "7D":
+            # For a weekly granularity, if we jump over the DST date in October,
+            # date_range erroneously returns an additional index element
+            # because that week contains 169 hours instead of 168.
+            index = index[:-1]
+        index = index.tz_convert("UTC")
+    elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
+        # For a daily or larger granularity, if we jump over the DST date in October,
+        # date_range erroneously returns an additional index element
+        # because the period contains one extra hour.
+        index = index[:-1]
+
+    return index
+
+
+def _parse_timeseries_generic(soup, label='quantity', to_float=True, merge_series=False):
+    series = {
+        '15min': [],
+        '30min': [],
+        '60min': []
+    }
+
+    for period in soup.find_all('period'):
+        data = {}
+        start = pd.Timestamp(period.find('start').text)
+        end = pd.Timestamp(period.find('end').text)
+        delta_text = _resolution_to_timedelta(res_text=period.find('resolution').text)
+        delta = pd.Timedelta(delta_text)
+        for point in period.find_all('point'):
+            value = point.find(label).text
+            if to_float:
+                value = value.replace(',', '')
+            position = int(point.find('position').text)
+            data[start + (position-1)*delta] = value
+        S = pd.Series(data).sort_index()
+        if soup.find('curvetype').text == 'A03':
+            # with A03 it's possible that positions are missing; this is when values are repeated
+            # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
+            # so let's reindex on a continuous range, which creates gaps if positions are missing,
+            # then forward fill, so the last valid value is repeated to fill the gaps
+            S = S.reindex(pd.date_range(start, end-delta, freq=delta_text)).ffill()
+        if delta_text not in series:
+            series[delta_text] = []
+        series[delta_text].append(S)
+    for freq, S in series.items():
+        if len(S) > 0:
+            series[freq] = pd.concat(S).sort_index()
+            if to_float:
+                series[freq] = series[freq].astype(float)
+        else:
+            series[freq] = None
+
+    # for endpoints which never have duplicated timeseries, the merge_series flag signals to just concat everything
+    if merge_series:
+        return pd.concat(series.values())
+    else:
+        return series
+
 
 def _parse_timeseries_generic_whole(xml_text, label='quantity', to_float=True):
     series_all = []
     for soup in _extract_timeseries(xml_text):
-        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float))
+        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float, merge_series=True))
 
     series_all = pd.concat(series_all).sort_index()
-    return series_all
\ No newline at end of file
+    return series_all
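
Usage note (not part of the patch): a minimal sketch of how the reworked
parsers surface after this change. The API key, bidding zone and timestamps
below are placeholder assumptions, and raw_xml stands for a raw API response
that is assumed to be available.

    from entsoe import EntsoePandasClient
    from entsoe.parsers import parse_prices
    import pandas as pd

    client = EntsoePandasClient(api_key='...')  # placeholder key
    start = pd.Timestamp('20240101', tz='Europe/Brussels')
    end = pd.Timestamp('20240102', tz='Europe/Brussels')

    # query_net_position now accepts a resolution and picks the matching
    # entry out of the per-resolution dict built by _parse_timeseries_generic
    net_pos = client.query_net_position('DE_LU', start=start, end=end,
                                        dayahead=True, resolution='60min')

    # parse_prices now returns one pd.Series per resolution key; resolutions
    # without data come back as an empty series
    prices = parse_prices(raw_xml)
    hourly = prices['60min']

Endpoints that never mix resolutions go through merge_series=True instead,
which concatenates the per-resolution series into a single pd.Series.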