fixes generic parser to correctly handle various scenarios that can happen

discussion and inspiration for fixes from #347 and #346
fboerman committed Oct 6, 2024
1 parent 647ae2b commit 4d6b401
Showing 3 changed files with 101 additions and 73 deletions.
6 changes: 3 additions & 3 deletions entsoe/entsoe.py
@@ -23,7 +23,7 @@
warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning)

__title__ = "entsoe-py"
-__version__ = "0.6.8"
+__version__ = "0.6.9"
__author__ = "EnergieID.be, Frank Boerman"
__license__ = "MIT"
@@ -1152,7 +1152,7 @@ def query_withdrawn_unavailability_of_generation_units(
class EntsoePandasClient(EntsoeRawClient):
    @year_limited
    def query_net_position(self, country_code: Union[Area, str],
-                           start: pd.Timestamp, end: pd.Timestamp, dayahead: bool = True) -> pd.Series:
+                           start: pd.Timestamp, end: pd.Timestamp, dayahead: bool = True, resolution: Literal['60min', '30min', '15min'] = '60min') -> pd.Series:
        """
        Parameters
@@ -1168,7 +1168,7 @@ def query_net_position(self, country_code: Union[Area, str],
        area = lookup_area(country_code)
        text = super(EntsoePandasClient, self).query_net_position(
            country_code=area, start=start, end=end, dayahead=dayahead)
-        series = parse_netpositions(text)
+        series = parse_netpositions(text, resolution=resolution)
        series = series.tz_convert(area.tz)
        series = series.truncate(before=start, after=end)
        return series
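The new resolution parameter defaults to '60min', so existing callers keep their current behaviour and finer net-position data is an explicit opt-in. A minimal usage sketch, assuming a valid ENTSO-E API token (the key and bidding zone below are illustrative, not part of this commit):

    import pandas as pd
    from entsoe import EntsoePandasClient

    client = EntsoePandasClient(api_key='YOUR-API-KEY')  # hypothetical token

    start = pd.Timestamp('20241001', tz='Europe/Brussels')
    end = pd.Timestamp('20241002', tz='Europe/Brussels')

    # resolution defaults to '60min'; pass '30min' or '15min' to opt in
    net_pos = client.query_net_position('BE', start=start, end=end,
                                        dayahead=True, resolution='15min')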
19 changes: 12 additions & 7 deletions entsoe/parsers.py
@@ -17,7 +17,6 @@
CONSUMPTION_ELEMENT = "outBiddingZone_Domain.mRID"


-
def parse_prices(xml_text):
    """
    Parameters
@@ -35,15 +34,18 @@ def parse_prices(xml_text):
    }
    for soup in _extract_timeseries(xml_text):
        soup_series = _parse_timeseries_generic(soup, 'price.amount')
-        series[soup_series.index.freqstr].append(soup_series)
+        for key in series.keys():
+            series[key].append(soup_series[key])

    for freq, freq_series in series.items():
        if len(freq_series) > 0:
-            series[freq] = pd.concat(freq_series).sort_index()
+            try:
+                series[freq] = pd.concat(freq_series).sort_index()
+            except ValueError:
+                series[freq] = pd.Series()
    return series
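Because _parse_timeseries_generic now returns one bucket per resolution, parse_prices appends every bucket and lets the try/except absorb the all-empty case: pd.concat silently drops None entries but raises ValueError when every object passed is None, which here means no document carried that resolution. A sketch of the resulting shape (variable names illustrative):

    # xml_text: raw XML from a day-ahead prices query
    prices = parse_prices(xml_text)
    hourly = prices['60min']     # pd.Series of hourly prices
    quarter = prices['15min']    # empty pd.Series when no 15min data was present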


-def parse_netpositions(xml_text):
+def parse_netpositions(xml_text, resolution):
    """
    Parameters
@@ -56,7 +58,7 @@
    """
    series_all = []
    for soup in _extract_timeseries(xml_text):
-        series = _parse_timeseries_generic(soup)
+        series = _parse_timeseries_generic(soup)[resolution]
        if 'REGION' in soup.find('out_domain.mrid').text:
            factor = -1  # flow is import so negative
        else:
@@ -692,7 +694,7 @@ def _parse_load_timeseries(soup):
    -------
    pd.Series
    """
-    return _parse_timeseries_generic(soup)
+    return _parse_timeseries_generic(soup, merge_series=True)

def _parse_generation_timeseries(soup, per_plant: bool = False, include_eic: bool = False) -> pd.Series:
    """
@@ -707,7 +709,10 @@ def _parse_generation_timeseries(soup, per_plant: bool = False, include_eic: bool = False) -> pd.Series:
    -------
    pd.Series
    """
-    series = _parse_timeseries_generic(soup)
+    # should never have duplicated timestamps across differing time resolutions,
+    # so simply concat all possibilities
+    series = _parse_timeseries_generic(soup, merge_series=True)

    # Check if there is a psrtype, if so, get it.
    _psrtype = soup.find('psrtype')
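Load and generation documents are not expected to repeat a timestamp across resolutions, so both parsers now ask for a single flat series via merge_series=True rather than the per-resolution dict. A sketch of the two return shapes, with illustrative names (soup being one parsed TimeSeries tag):

    # dict form (default): one entry per resolution, None where absent
    by_resolution = _parse_timeseries_generic(soup)
    hourly = by_resolution['60min']

    # merged form: all resolutions concatenated into one pd.Series;
    # safe only when timestamps cannot collide across resolutions
    flat = _parse_timeseries_generic(soup, merge_series=True)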
149 changes: 86 additions & 63 deletions entsoe/series_parsers.py
@@ -18,6 +18,7 @@ def _extract_timeseries(xml_text):
    for timeseries in soup.find_all('timeseries'):
        yield timeseries

+
def _resolution_to_timedelta(res_text: str) -> str:
    """
    Convert an Entsoe resolution to something that pandas can understand
@@ -40,73 +41,95 @@ def _resolution_to_timedelta(res_text: str) -> str:
                          "issue.".format(res_text))
    return delta

+
def _parse_datetimeindex(soup, tz=None):
    """
    Create a datetimeindex from a parsed beautifulsoup,
    given that it contains the elements 'start', 'end'
    and 'resolution'

    Parameters
    ----------
    soup : bs4.element.tag
    tz: str

    Returns
    -------
    pd.DatetimeIndex
    """
    start = pd.Timestamp(soup.find('start').text)
    end = pd.Timestamp(soup.find_all('end')[-1].text)
    if tz is not None:
        start = start.tz_convert(tz)
        end = end.tz_convert(tz)

    delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
    index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
    if tz is not None:
        dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
        if dst_jump and delta == "7D":
            # For a weekly granularity, if we jump over the DST date in October,
            # date_range erroneously returns an additional index element
            # because that week contains 169 hours instead of 168.
            index = index[:-1]
        index = index.tz_convert("UTC")
    elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
        # For a daily or larger granularity, if we jump over the DST date in October,
        # date_range erroneously returns an additional index element
        # because the period contains one extra hour.
        index = index[:-1]

    return index

-def _parse_timeseries_generic(soup, label='quantity', to_float=True):
-    data = {}
-    for point in soup.find_all('point'):
-        value = point.find(label).text
-        if to_float:
-            value = value.replace(',', '')
-        data[int(point.find('position').text)] = value
-
-    series = pd.Series(data)
-    series.sort_index()
-    index = _parse_datetimeindex(soup)
-    if soup.find('curvetype').text == 'A03':
-        # with A03 its possible that positions are missing, this is when values are repeated
-        # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
-        # so lets do reindex on a continious range which creates gaps if positions are missing
-        # then forward fill, so repeat last valid value, to fill the gaps
-        series = series.reindex(list(range(1, len(index)+1))).ffill()
-
-    series.index = index
-    if to_float:
-        series = series.astype(float)
-
-    return series
"""
Create a datetimeindex from a parsed beautifulsoup,
given that it contains the elements 'start', 'end'
and 'resolution'
Parameters
----------
soup : bs4.element.tag
tz: str
Returns
-------
pd.DatetimeIndex
"""
start = pd.Timestamp(soup.find('start').text)
end = pd.Timestamp(soup.find_all('end')[-1].text)
if tz is not None:
start = start.tz_convert(tz)
end = end.tz_convert(tz)

delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
if tz is not None:
dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
if dst_jump and delta == "7D":
# For a weekly granularity, if we jump over the DST date in October,
# date_range erronously returns an additional index element
# because that week contains 169 hours instead of 168.
index = index[:-1]
index = index.tz_convert("UTC")
elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
# For a daily or larger granularity, if we jump over the DST date in October,
# date_range erronously returns an additional index element
# because the period contains one extra hour.
index = index[:-1]

return index


+def _parse_timeseries_generic(soup, label='quantity', to_float=True, merge_series=False):
+    series = {
+        '15min': [],
+        '30min': [],
+        '60min': []
+    }
+
+    for period in soup.find_all('period'):
+        data = {}
+        start = pd.Timestamp(period.find('start').text)
+        end = pd.Timestamp(period.find('end').text)
+        delta_text = _resolution_to_timedelta(res_text=period.find('resolution').text)
+        delta = pd.Timedelta(delta_text)
+        for point in period.find_all('point'):
+            value = point.find(label).text
+            if to_float:
+                value = value.replace(',', '')
+            position = int(point.find('position').text)
+            data[start + (position-1)*delta] = value
+        S = pd.Series(data).sort_index()
+        if soup.find('curvetype').text == 'A03':
+            # with A03 it's possible that positions are missing; this is when values are repeated
+            # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
+            # so reindex on a continuous range, which creates gaps if positions are missing,
+            # then forward fill (repeat the last valid value) to fill the gaps
+            S = S.reindex(pd.date_range(start, end-delta, freq=delta_text)).ffill()
+        if delta_text not in series:
+            series[delta_text] = []
+        series[delta_text].append(S)
+    for freq, S in series.items():
+        if len(S) > 0:
+            series[freq] = pd.concat(S).sort_index()
+            if to_float:
+                series[freq] = series[freq].astype(float)
+        else:
+            series[freq] = None
+
+    # for endpoints which never have duplicated timeseries, the flag merge_series signals to just concat everything
+    if merge_series:
+        return pd.concat(series.values())
+    else:
+        return series


def _parse_timeseries_generic_whole(xml_text, label='quantity', to_float=True):
    series_all = []
    for soup in _extract_timeseries(xml_text):
-        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float))
+        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float, merge_series=True))

    series_all = pd.concat(series_all).sort_index()
-    return series_all
+    return series_all
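Keying each point by its real timestamp, start + (position - 1) * delta, is what makes the rewritten parser robust: points from different periods or resolutions can no longer collide on bare positions 1..N, and for curve type A03 the reindex/ffill reconstructs the values the document omits for repetition. A small worked illustration of that gap filling, assuming hourly data (values invented):

    import pandas as pd

    start = pd.Timestamp('2024-10-06 00:00', tz='UTC')
    delta = pd.Timedelta('60min')
    # an A03 document carrying only positions 1 and 4: the value at
    # position 1 implicitly repeats for the missing positions 2 and 3
    data = {start + (p - 1) * delta: v for p, v in [(1, 10.0), (4, 12.5)]}
    S = pd.Series(data).sort_index()
    S = S.reindex(pd.date_range(start, start + 3 * delta, freq='60min')).ffill()
    # -> 10.0, 10.0, 10.0, 12.5 at 00:00, 01:00, 02:00, 03:00 UTC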
