Make WPC parser more fault tolerant (Fix #3921)

dopplershift · dopplershift · commit 75eecf2929a0 · 2025-09-25T15:10:36.000-06:00
When parsing one part of the file fails with a ValueError, log a warning
and continue parsing the remaining parts instead of aborting.
diff --git a/src/metpy/io/text.py b/src/metpy/io/text.py
@@ -5,6 +5,7 @@
 
 import contextlib
 from datetime import datetime, timezone
+import logging
 import re
 import string
 
@@ -16,6 +17,8 @@
 
 exporter = Exporter(globals())
 
+log = logging.getLogger(__name__)
+
 
 def _decode_coords(coordinates):
     """Turn a string of coordinates from WPC coded surface bulletin into a lon/lat tuple.
@@ -107,44 +110,48 @@ def parse_wpc_surface_bulletin(bulletin, year=None):
         # A single file may have multiple sets of data that are valid at different times. Set
         # the valid_time string that will correspond to all the following lines parsed, until
         # the next valid_time is found.
-        if parts[0] in ('VALID', 'SURFACE PROG VALID'):
-            dtstr = parts[-1]
-            valid_time = valid_time.replace(year=year or valid_time.year, month=int(dtstr[:2]),
-                                            day=int(dtstr[2:4]), hour=int(dtstr[4:6]),
-                                            minute=0, second=0, microsecond=0)
-        else:
-            feature, *info = parts
-            if feature in {'HIGHS', 'LOWS'}:
-                # For each pressure center, add its data as a new row
-                # While ideally these occur in pairs, some bulletins have had multiple
-                # locations for a single center strength value. So instead walk one at a time
-                # and keep track of the most recent strength.
-                strength = np.nan
-                for item in info:
-                    if len(item) <= 4 and item[0] in {'8', '9', '1'}:
-                        strength = int(item)
+        try:
+            if parts[0] in ('VALID', 'SURFACE PROG VALID'):
+                dtstr = parts[-1]
+                valid_time = valid_time.replace(year=year or valid_time.year,
+                                                month=int(dtstr[:2]), day=int(dtstr[2:4]),
+                                                hour=int(dtstr[4:6]), minute=0, second=0,
+                                                microsecond=0)
+            else:
+                feature, *info = parts
+                if feature in {'HIGHS', 'LOWS'}:
+                    # For each pressure center, add its data as a new row
+                    # While ideally these occur in pairs, some bulletins have had multiple
+                    # locations for a single center strength value. So instead walk one at a
+                    # time and keep track of the most recent strength.
+                    strength = np.nan
+                    for item in info:
+                        if len(item) <= 4 and item[0] in {'8', '9', '1'}:
+                            strength = int(item)
+                        else:
+                            parsed_text.append((valid_time, feature.rstrip('S'), strength,
+                                                Point(_decode_coords(item))))
+                elif feature in {'WARM', 'COLD', 'STNRY', 'OCFNT', 'TROF'}:
+                    # Some bulletins include 'WK', 'MDT', or 'STG' to indicate the front's
+                    # strength. If present, separate it from the rest of the info, which gives
+                    # the position of the front.
+                    if info[0][0] in string.ascii_letters:
+                        strength, *boundary = info
                     else:
-                        parsed_text.append((valid_time, feature.rstrip('S'), strength,
-                                            Point(_decode_coords(item))))
-            elif feature in {'WARM', 'COLD', 'STNRY', 'OCFNT', 'TROF'}:
-                # Some bulletins include 'WK', 'MDT', or 'STG' to indicate the front's
-                # strength. If present, separate it from the rest of the info, which gives the
-                # position of the front.
-                if info[0][0] in string.ascii_letters:
-                    strength, *boundary = info
-                else:
-                    strength, boundary = np.nan, info
-
-                # Create a list of Points and create Line from points, if possible
-                boundary = [Point(_decode_coords(point)) for point in boundary]
-                boundary = LineString(boundary) if len(boundary) > 1 else boundary[0]
-
-                # Add new row in the data for each front
-                parsed_text.append((valid_time, feature, strength, boundary))
-            # Look for a year at the end of the line (from the product header)
-            elif (year is None and len(info) >= 2 and re.match(r'\d{4}', info[-1])
-                  and re.match(r'\d{2}', info[-2])):
-                with contextlib.suppress(ValueError):
-                    year = int(info[-1])
+                        strength, boundary = np.nan, info
+
+                    # Create a list of Points and create Line from points, if possible
+                    boundary = [Point(_decode_coords(point)) for point in boundary]
+                    boundary = LineString(boundary) if len(boundary) > 1 else boundary[0]
+
+                    # Add new row in the data for each front
+                    parsed_text.append((valid_time, feature, strength, boundary))
+                # Look for a year at the end of the line (from the product header)
+                elif (year is None and len(info) >= 2 and re.match(r'\d{4}', info[-1])
+                      and re.match(r'\d{2}', info[-2])):
+                    with contextlib.suppress(ValueError):
+                        year = int(info[-1])
+        except ValueError:
+            log.warning('Could not parse: %s', ' '.join(parts))
 
     return pd.DataFrame(parsed_text, columns=['valid', 'feature', 'strength', 'geometry'])
diff --git a/tests/io/test_text.py b/tests/io/test_text.py
@@ -98,3 +98,19 @@ def test_negative_lat():
  """)
     df = parse_wpc_surface_bulletin(sample)
     assert df.geometry[0] == sgeom.Point([-51, -3])
+
+
+@needs_module('shapely')
+def test_bad_line_continue(caplog):
+    """Test decoding of a file with some bad characters."""
+    from io import BytesIO
+
+    sample = BytesIO(b"""VALID 062818Z
+HIGHS 1022 3961069 1020 3851069 1026 3750773 1022 4430845 1019 5520728
+LOWS 1016 4510934 1002 3441145 1003 4271229 1002 4471230 1009 4631181
+TROF 2971023 2831018 2691008 I2531003
+TROF 2911100 2681082 2511055 2431024
+ """)
+    df = parse_wpc_surface_bulletin(sample)
+    assert len(df) == 11
+    assert 'Could not parse' in caplog.text