From 69abe8a0416f649f6d8b15d4bdfe3e55bc9835e0 Mon Sep 17 00:00:00 2001 From: Sasivarnasarma Date: Wed, 1 Jul 2026 04:51:46 +0530 Subject: [PATCH] fix(youtube): handle transcript list errors gracefully Wraps YouTube transcript listing and retrieval in a try/except block. This prevents the converter from crashing and falling back to HtmlConverter when transcripts are disabled, rate-limited, or blocked. Instead, the converter now gracefully continues and returns the successfully extracted video metadata and description. --- .../converters/_youtube_converter.py | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index c96e8f4f6..b94eebaa7 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -151,40 +151,44 @@ def convert( params = parse_qs(parsed_url.query) # type: ignore if "v" in params and params["v"][0]: video_id = str(params["v"][0]) - transcript_list = ytt_api.list(video_id) - languages = ["en"] - for transcript in transcript_list: - languages.append(transcript.language_code) - break try: - youtube_transcript_languages = kwargs.get( - "youtube_transcript_languages", languages - ) - # Retry the transcript fetching operation - transcript = self._retry_operation( - lambda: ytt_api.fetch( - video_id, languages=youtube_transcript_languages - ), - retries=3, # Retry 3 times - delay=2, # 2 seconds delay between retries - ) - - if transcript: - transcript_text = " ".join( - [part.text for part in transcript] - ) # type: ignore - except Exception as e: - # No transcript available - if len(languages) == 1: - print(f"Error fetching transcript: {e}") - else: - # Translate transcript into first kwarg - transcript = ( - transcript_list.find_transcript(languages) - .translate(youtube_transcript_languages[0]) - .fetch() + transcript_list = ytt_api.list(video_id) + languages = ["en"] + for transcript in transcript_list: + languages.append(transcript.language_code) + break + try: + youtube_transcript_languages = kwargs.get( + "youtube_transcript_languages", languages + ) + # Retry the transcript fetching operation + transcript = self._retry_operation( + lambda: ytt_api.fetch( + video_id, languages=youtube_transcript_languages + ), + retries=3, # Retry 3 times + delay=2, # 2 seconds delay between retries ) - transcript_text = " ".join([part.text for part in transcript]) + + if transcript: + transcript_text = " ".join( + [part.text for part in transcript] + ) # type: ignore + except Exception as e: + # No transcript available + if len(languages) == 1: + print(f"Error fetching transcript: {e}") + else: + # Translate transcript into first kwarg + transcript = ( + transcript_list.find_transcript(languages) + .translate(youtube_transcript_languages[0]) + .fetch() + ) + transcript_text = " ".join([part.text for part in transcript]) + except Exception as e: + print(f"YouTube transcript extraction failed: {e}") + transcript_text = "*(Transcript unavailable)*" if transcript_text: webpage_text += f"\n### Transcript\n{transcript_text}\n"