core

Fetching transcript

source

download_audio

 download_audio (vid_id:str, dest_dir:pathlib.Path)

Download audio from YouTube video

Type Details
vid_id str YouTube video ID
dest_dir Path Output directory
Exported source
# Read the Gemini API key from the environment at import time (None if unset).
# NOTE(review): this module-level variable looks unused below — transcribe_audio
# reads os.environ['GEMINI_API_KEY'] directly; confirm before relying on it.
gemini_api_key = os.getenv("GEMINI_API_KEY")
Exported source
# Configure root logging once at import: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
Exported source
def download_audio(
    vid_id: str, # YouTube video ID
    dest_dir: Path # Output directory
    ):
    "Download audio from a YouTube video as MP3 into `dest_dir`; reuse an existing file if present"
    logging.info(f"Downloading audio for video {vid_id}")
    dest = Path(dest_dir)  # normalize once; also accepts plain strings
    # parents=True so a nested destination (e.g. '../_audio/raw') doesn't raise
    dest.mkdir(parents=True, exist_ok=True)
    out_file = dest/f'{vid_id}.mp3'
    if not out_file.exists():
        # list form (shell=False); check=True raises CalledProcessError on failure
        subprocess.run(['yt-dlp', '-x', '--audio-format', 'mp3', f'https://www.youtube.com/watch?v={vid_id}', '-o', str(out_file)], check=True)
        logging.info(f"Downloaded audio to {out_file}")
    else:
        logging.info(f"Using existing audio file {out_file}")
    return out_file
# Example: download the audio track for one lecture into ../_audio.
video_id = 'GJ0u09SIPh4'
download_audio(video_id, Path('../_audio'))
2025-07-20 19:09:25,533 - INFO - Downloading audio for video GJ0u09SIPh4
[youtube] Extracting URL: https://www.youtube.com/watch?v=GJ0u09SIPh4
[youtube] GJ0u09SIPh4: Downloading webpage
[youtube] GJ0u09SIPh4: Downloading tv client config
[youtube] GJ0u09SIPh4: Downloading player 69b31e11-main
[youtube] GJ0u09SIPh4: Downloading tv player API JSON
[youtube] GJ0u09SIPh4: Downloading ios player API JSON
[youtube] GJ0u09SIPh4: Downloading m3u8 information
[info] GJ0u09SIPh4: Downloading 1 format(s): 251
[download] Destination: ../_audio/GJ0u09SIPh4.webm
[download] 100% of   83.98MiB in 00:00:08 at 9.79MiB/s     
[ExtractAudio] Destination: ../_audio/GJ0u09SIPh4.mp3
2025-07-20 19:10:17,770 - INFO - Downloaded audio to ../_audio/GJ0u09SIPh4.mp3
Deleting original file ../_audio/GJ0u09SIPh4.webm (pass -k to keep)
Path('../_audio/GJ0u09SIPh4.mp3')

source

detect_silence

 detect_silence (audio_file:pathlib.Path)

Detect silence in audio file and return start and end times

Exported source
def detect_silence(
    audio_file:Path, # Audio file to analyze
    noise:str='-30dB', # silencedetect noise threshold (default matches previous hard-coded value)
    min_dur:float=0.5 # minimum silence duration in seconds (silencedetect `d`)
    ):
    "Detect silence in audio file and return the (stdout, stderr) of the ffmpeg run"
    # silencedetect writes its report to stderr; stdout is not captured (None).
    stream = ffmpeg.input(str(audio_file))
    stream = stream.filter('silencedetect', noise=noise, d=min_dur)
    # null muxer: analyze the stream without writing any media output
    stream = stream.output('null', f='null')
    out, err = ffmpeg.run(stream, capture_stderr=True)
    return out, err
# Run silence detection; keep only stderr, which carries the silencedetect report.
_, err = detect_silence(Path('../_audio/GJ0u09SIPh4.mp3'))

source

parse_silence_ends

 parse_silence_ends (stderr_output:bytes)

Parse silence ends from ffmpeg stderr output

Exported source
def parse_silence_ends(stderr_output:bytes):
    "Parse silence ends from ffmpeg stderr output"
    # Each detected silence produces a 'silence_end: <seconds>' line on stderr.
    text = stderr_output.decode()
    return [float(m.group(1)) for m in re.finditer(r'silence_end: ([\d.]+)', text)]
ends = parse_silence_ends(err); L(ends)[:10]  # first 10 silence-end timestamps (seconds)
(#10) [0.513563,15.558687,26.482021,29.918437,32.245583,34.150583,35.980771,36.597167,39.411437,43.585812]

source

find_split_points

 find_split_points (silence_ends:list[float], total_len:float,
                    chunk_len:float=600)

Find points to split audio based on silence detection, aiming for chunks of chunk_len seconds

Type Default Details
silence_ends list silence ends
total_len float total length of the audio (in seconds)
chunk_len float 600 length of the chunks (in seconds)
Exported source
def find_split_points(
    silence_ends:list[float], # silence ends
    total_len:float, # total length of the audio (in seconds)
    chunk_len:float=600 # length of the chunks (in seconds)
    ):
    "Find points to split audio based on silence detection, aiming for chunks of `chunk_len` seconds"
    splits,target = [0],chunk_len
    for t in silence_ends:
        # take the first silence end at/after each chunk_len-sized target
        if t >= target:
            splits.append(t)
            target += chunk_len
    # close the final chunk; guard avoids a duplicate split point (and hence a
    # zero-length chunk downstream) when the last silence end equals total_len
    if splits[-1] < total_len:
        splits.append(total_len)
    return splits

source

get_audio_duration

 get_audio_duration (audio_file:pathlib.Path|str)

Get duration of audio file in seconds

Exported source
def get_audio_duration(audio_file:"Path|str"):
    "Get duration of audio file in seconds"
    # ffprobe reports duration as a string inside the 'format' section
    metadata = ffmpeg.probe(str(audio_file))
    duration_str = metadata['format']['duration']
    return float(duration_str)
tot_len = get_audio_duration(Path('../_audio/GJ0u09SIPh4.mp3')); tot_len  # total length in seconds
6995.976
soft_splits = find_split_points(ends, tot_len); soft_splits  # ~600s boundaries snapped to silences
[0,
 603.482062,
 1202.536562,
 1802.256479,
 2401.709521,
 3004.959437,
 3605.712229,
 4206.138958,
 4800.153625,
 5400.729625,
 6003.723708,
 6610.651771,
 6995.976]

source

get_mime_type

 get_mime_type (f)
Exported source
def get_mime_type(f):
    "Return 'audio/mpeg' for .mp3 files and 'audio/mp4' for everything else."
    suffix = Path(f).suffix.lower()
    if suffix == '.mp3':
        return 'audio/mpeg'
    return 'audio/mp4'

source

split_audio

 split_audio (fname:pathlib.Path, splits:list,
              dest_dir:str|pathlib.Path='_audio_chunks')

Split audio file into chunks based on split points

Type Default Details
fname Path Audio file to split
splits list List of timestamps in seconds to split at
dest_dir str | pathlib.Path _audio_chunks Directory to save chunks
Exported source
def split_audio(
    fname:"Path", # Audio file to split
    splits:"list", # List of timestamps in seconds to split at
    dest_dir:'str|Path'="_audio_chunks"): # Directory to save chunks
    "Split audio file into chunks based on split points"
    fname = Path(fname)  # normalize so a plain str works too (.stem is needed below)
    dest = Path(dest_dir)
    # parents=True so a nested destination (e.g. '../_audio/_audio_chunks') doesn't raise
    dest.mkdir(parents=True, exist_ok=True)
    chunks = []
    # pair consecutive split points: (start, end) per chunk
    for i, (start, end) in tqdm(enumerate(zip(splits, splits[1:])), total=len(splits)-1):
        output_path = dest/f"{fname.stem}_chunk_{i+1:02d}.mp3"
        chunks.append(output_path)
        # acodec='copy': cut without re-encoding, so splitting is fast and lossless
        (ffmpeg
         .input(str(fname), ss=start, t=end-start)
         .output(str(output_path), acodec='copy')
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True))
    return chunks
# Split the lecture into ~10-minute chunks at the detected silence points.
split_audio(Path('../_audio/GJ0u09SIPh4.mp3'), soft_splits, dest_dir='../_audio/_audio_chunks')
100%|██████████| 12/12 [00:02<00:00,  5.70it/s]
[Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_01.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_02.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_03.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_04.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_05.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_06.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_07.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_08.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_09.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_10.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_11.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_12.mp3')]

source

transcribe_audio

 transcribe_audio (chunks_dir:str|pathlib.Path,
                   dest_file:str|pathlib.Path,
                   model:str='gemini-2.0-flash-001', max_concurrent:int=3,
                   prompt:str='Please transcribe this audio file:')

Transcribe audio chunks in parallel and combine into single transcript

Type Default Details
chunks_dir str | pathlib.Path Directory containing audio chunks
dest_file str | pathlib.Path File to save transcript to
model str gemini-2.0-flash-001 Gemini model to use
max_concurrent int 3 Max concurrent transcriptions
prompt str Please transcribe this audio file: Custom prompt for transcription
Returns str
Exported source
async def transcribe_audio(
    chunks_dir:str|Path,  # Directory containing audio chunks
    dest_file:str|Path, # File to save transcript to
    model:str='gemini-2.0-flash-001', # Gemini model to use
    max_concurrent:int=3,   # Max concurrent transcriptions
    prompt:str="Please transcribe this audio file:" # Custom prompt for transcription
) -> str:
    "Transcribe audio chunks in parallel and combine into single transcript"
    semaphore = asyncio.Semaphore(max_concurrent)
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
    
    async def _transcribe_chunk(chunk_path):
        async with semaphore:
            audio_data = chunk_path.read_bytes()
            audio_part = types.Part.from_bytes(
                mime_type=get_mime_type(chunk_path), 
                data=audio_data
            )
            response = await client.aio.models.generate_content(
                model=model,
                contents=[prompt, audio_part]
            )
            return response.text
    
    chunks = sorted(Path(chunks_dir).glob("*.mp3"))
    tasks = [_transcribe_chunk(chunk) for chunk in chunks]
    transcripts = await asyncio.gather(*tasks)
    
    full_transcript = '\n'.join(transcripts)
    dest_path = Path(dest_file)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    dest_path.write_text(full_transcript)
    return full_transcript
# Transcribe all chunks concurrently (top-level await: notebook execution context).
transcript = await transcribe_audio(
    chunks_dir="../_audio/_audio_chunks", 
    dest_file="../_transcripts/transcript.txt",
    prompt="Please transcribe this audio file verbatim. Note that this is an academic course in French from College de France. The transcript should be in French."
)
2025-07-20 19:21:07,363 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:07,403 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:07,446 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:21,297 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:21,307 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:21,745 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:21,758 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:22,070 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:22,075 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:35,576 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:35,584 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:36,409 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:36,418 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:37,558 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:37,597 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:48,528 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:48,536 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:50,344 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:50,352 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:51,937 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:51,946 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:22:02,123 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:22:02,547 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:22:05,145 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
print(transcript[:1000])
Bien, mesdames messieurs. Merci d'être venus euh aussi nombreux. Cette affluence est impressionnante. Euh, je salue
toutes les personnes aussi qui vont nous suivre sur internet et peut-être sur France Culture, je ne sais pas 
encore. Et j'espère pouvoir, je vais utiliser des diapositives et pouvoir les mettre euh en ligne et les rendre 
disponibles en ligne sur le site du du Collège de France. Alors après la leçon inaugurale prononcée le 5 avril, 
donc nous entamons aujourd'hui les cours de l'année universitaire, de ce qui reste de l'année universitaire 
2017-2018, cette année étant déjà très avancée, les contraintes de salle et de calendrier étant ce qu'elles sont, 
il n'a pas été possible de programmer ces cours suivant un rythme hebdomadaire régulier et c'est l'année prochaine 
seulement que nous aurons droit à un créneau fixe à compter du mois de janvier euh sachant toutefois que le 
séminaire public de l'année prochaine qui va doubler le cours donc commencera en fait dès la fin novembre.
print(transcript[-1000:])
us les enfants, les les personnes qui vivent ou les adultes qui vivent avec leurs parents. Et l'exploitation des 
données du recensement se fait dans le cadre du ménage, et pas seulement dans un cadre individuel. Et donc, chaque 
fois qu'un euh une personne, adulte ou enfant, vit encore avec ses parents, on a le le renseignement pour ses 
parents, euh on a les les les fameuses variables en question pour ses parents. Donc, il y aurait pas une rupture 
radicale à introduire ces questions, mais euh toute une série d'associations euh ou de syndicats sont opposés à 
cette introduction et parle de de recensement des origines, de fichage des origines, et cetera. Je rappelle que le 
recensement est anonyme, hein, et que il est évidemment étroitement surveillé, mais je je conçois très bien que 
tout ceci puisse se discuter. Voilà, et nous avons abordé déjà au cours de ce cours euh bien des bien des choses. 
Je vous remercie de votre formidable attention et je vous donne rendez-vous euh à lundi pour euh