Skip to content

Commit 75eecf2

Browse files
committed
Make WPC parser more fault tolerant (Fix #3921)
When parsing a part of the file fails, allow parsing to continue with the remaining parts.
1 parent 2c7e5a0 commit 75eecf2

File tree

2 files changed

+61
-38
lines changed

2 files changed

+61
-38
lines changed

src/metpy/io/text.py

Lines changed: 45 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import contextlib
77
from datetime import datetime, timezone
8+
import logging
89
import re
910
import string
1011

@@ -16,6 +17,8 @@
1617

1718
exporter = Exporter(globals())
1819

20+
log = logging.getLogger(__name__)
21+
1922

2023
def _decode_coords(coordinates):
2124
"""Turn a string of coordinates from WPC coded surface bulletin into a lon/lat tuple.
@@ -107,44 +110,48 @@ def parse_wpc_surface_bulletin(bulletin, year=None):
107110
# A single file may have multiple sets of data that are valid at different times. Set
108111
# the valid_time string that will correspond to all the following lines parsed, until
109112
# the next valid_time is found.
110-
if parts[0] in ('VALID', 'SURFACE PROG VALID'):
111-
dtstr = parts[-1]
112-
valid_time = valid_time.replace(year=year or valid_time.year, month=int(dtstr[:2]),
113-
day=int(dtstr[2:4]), hour=int(dtstr[4:6]),
114-
minute=0, second=0, microsecond=0)
115-
else:
116-
feature, *info = parts
117-
if feature in {'HIGHS', 'LOWS'}:
118-
# For each pressure center, add its data as a new row
119-
# While ideally these occur in pairs, some bulletins have had multiple
120-
# locations for a single center strength value. So instead walk one at a time
121-
# and keep track of the most recent strength.
122-
strength = np.nan
123-
for item in info:
124-
if len(item) <= 4 and item[0] in {'8', '9', '1'}:
125-
strength = int(item)
113+
try:
114+
if parts[0] in ('VALID', 'SURFACE PROG VALID'):
115+
dtstr = parts[-1]
116+
valid_time = valid_time.replace(year=year or valid_time.year,
117+
month=int(dtstr[:2]), day=int(dtstr[2:4]),
118+
hour=int(dtstr[4:6]), minute=0, second=0,
119+
microsecond=0)
120+
else:
121+
feature, *info = parts
122+
if feature in {'HIGHS', 'LOWS'}:
123+
# For each pressure center, add its data as a new row
124+
# While ideally these occur in pairs, some bulletins have had multiple
125+
# locations for a single center strength value. So instead walk one at a
126+
# time and keep track of the most recent strength.
127+
strength = np.nan
128+
for item in info:
129+
if len(item) <= 4 and item[0] in {'8', '9', '1'}:
130+
strength = int(item)
131+
else:
132+
parsed_text.append((valid_time, feature.rstrip('S'), strength,
133+
Point(_decode_coords(item))))
134+
elif feature in {'WARM', 'COLD', 'STNRY', 'OCFNT', 'TROF'}:
135+
# Some bulletins include 'WK', 'MDT', or 'STG' to indicate the front's
136+
# strength. If present, separate it from the rest of the info, which gives
137+
# the position of the front.
138+
if info[0][0] in string.ascii_letters:
139+
strength, *boundary = info
126140
else:
127-
parsed_text.append((valid_time, feature.rstrip('S'), strength,
128-
Point(_decode_coords(item))))
129-
elif feature in {'WARM', 'COLD', 'STNRY', 'OCFNT', 'TROF'}:
130-
# Some bulletins include 'WK', 'MDT', or 'STG' to indicate the front's
131-
# strength. If present, separate it from the rest of the info, which gives the
132-
# position of the front.
133-
if info[0][0] in string.ascii_letters:
134-
strength, *boundary = info
135-
else:
136-
strength, boundary = np.nan, info
137-
138-
# Create a list of Points and create Line from points, if possible
139-
boundary = [Point(_decode_coords(point)) for point in boundary]
140-
boundary = LineString(boundary) if len(boundary) > 1 else boundary[0]
141-
142-
# Add new row in the data for each front
143-
parsed_text.append((valid_time, feature, strength, boundary))
144-
# Look for a year at the end of the line (from the product header)
145-
elif (year is None and len(info) >= 2 and re.match(r'\d{4}', info[-1])
146-
and re.match(r'\d{2}', info[-2])):
147-
with contextlib.suppress(ValueError):
148-
year = int(info[-1])
141+
strength, boundary = np.nan, info
142+
143+
# Create a list of Points and create Line from points, if possible
144+
boundary = [Point(_decode_coords(point)) for point in boundary]
145+
boundary = LineString(boundary) if len(boundary) > 1 else boundary[0]
146+
147+
# Add new row in the data for each front
148+
parsed_text.append((valid_time, feature, strength, boundary))
149+
# Look for a year at the end of the line (from the product header)
150+
elif (year is None and len(info) >= 2 and re.match(r'\d{4}', info[-1])
151+
and re.match(r'\d{2}', info[-2])):
152+
with contextlib.suppress(ValueError):
153+
year = int(info[-1])
154+
except ValueError:
155+
log.warning('Could not parse: %s', ' '.join(parts))
149156

150157
return pd.DataFrame(parsed_text, columns=['valid', 'feature', 'strength', 'geometry'])

tests/io/test_text.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,19 @@ def test_negative_lat():
9898
""")
9999
df = parse_wpc_surface_bulletin(sample)
100100
assert df.geometry[0] == sgeom.Point([-51, -3])
101+
102+
103+
@needs_module('shapely')
104+
def test_bad_line_continue(caplog):
105+
"""Test decoding of a file with some bad characters."""
106+
from io import BytesIO
107+
108+
sample = BytesIO(b"""VALID 062818Z
109+
HIGHS 1022 3961069 1020 3851069 1026 3750773 1022 4430845 1019 5520728
110+
LOWS 1016 4510934 1002 3441145 1003 4271229 1002 4471230 1009 4631181
111+
TROF 2971023 2831018 2691008 I2531003
112+
TROF 2911100 2681082 2511055 2431024
113+
""")
114+
df = parse_wpc_surface_bulletin(sample)
115+
assert len(df) == 11
116+
assert 'Could not parse' in caplog.text

0 commit comments

Comments
 (0)