core

Fetching transcript

source

download_audio

 download_audio (vid_id:str, dest_dir:pathlib.Path)

Download audio from YouTube video

Type Details
vid_id str YouTube video ID
dest_dir Path Output directory
Exported source
# Read the Gemini API key from the environment at import time (None if unset).
# NOTE(review): this module-level variable looks unused below — transcribe_audio
# reads os.environ['GEMINI_API_KEY'] directly; confirm before relying on it.
gemini_api_key = os.getenv("GEMINI_API_KEY")
Exported source
# Configure root logging once at import: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
Exported source
def download_audio(
    vid_id: str, # YouTube video ID
    dest_dir: Path # Output directory
    ):
    "Download audio from a YouTube video as MP3 into `dest_dir`; reuse an existing file if present"
    logging.info(f"Downloading audio for video {vid_id}")
    dest = Path(dest_dir)  # normalize once; also accepts plain strings
    # parents=True so a nested destination (e.g. '../_audio/raw') doesn't raise
    dest.mkdir(parents=True, exist_ok=True)
    out_file = dest/f'{vid_id}.mp3'
    if not out_file.exists():
        # list form (shell=False); check=True raises CalledProcessError on failure
        subprocess.run(['yt-dlp', '-x', '--audio-format', 'mp3', f'https://www.youtube.com/watch?v={vid_id}', '-o', str(out_file)], check=True)
        logging.info(f"Downloaded audio to {out_file}")
    else:
        logging.info(f"Using existing audio file {out_file}")
    return out_file
# Example: download the audio track for one lecture into ../_audio.
video_id = 'GJ0u09SIPh4'
download_audio(video_id, Path('../_audio'))
2025-07-20 19:09:25,533 - INFO - Downloading audio for video GJ0u09SIPh4
[youtube] Extracting URL: https://www.youtube.com/watch?v=GJ0u09SIPh4
[youtube] GJ0u09SIPh4: Downloading webpage
[youtube] GJ0u09SIPh4: Downloading tv client config
[youtube] GJ0u09SIPh4: Downloading player 69b31e11-main
[youtube] GJ0u09SIPh4: Downloading tv player API JSON
[youtube] GJ0u09SIPh4: Downloading ios player API JSON
[youtube] GJ0u09SIPh4: Downloading m3u8 information
[info] GJ0u09SIPh4: Downloading 1 format(s): 251
[download] Destination: ../_audio/GJ0u09SIPh4.webm
[download] 100% of   83.98MiB in 00:00:08 at 9.79MiB/s     
[ExtractAudio] Destination: ../_audio/GJ0u09SIPh4.mp3
2025-07-20 19:10:17,770 - INFO - Downloaded audio to ../_audio/GJ0u09SIPh4.mp3
Deleting original file ../_audio/GJ0u09SIPh4.webm (pass -k to keep)
Path('../_audio/GJ0u09SIPh4.mp3')

source

detect_silence

 detect_silence (audio_file:pathlib.Path)

Detect silence in audio file and return start and end times

Exported source
def detect_silence(
    audio_file:Path, # Audio file to analyze
    noise:str='-30dB', # silencedetect noise threshold (default matches previous hard-coded value)
    min_dur:float=0.5 # minimum silence duration in seconds (silencedetect `d`)
    ):
    "Detect silence in audio file and return the (stdout, stderr) of the ffmpeg run"
    # silencedetect writes its report to stderr; stdout is not captured (None).
    stream = ffmpeg.input(str(audio_file))
    stream = stream.filter('silencedetect', noise=noise, d=min_dur)
    # null muxer: analyze the stream without writing any media output
    stream = stream.output('null', f='null')
    out, err = ffmpeg.run(stream, capture_stderr=True)
    return out, err
# Run silence detection; keep only stderr, which carries the silencedetect report.
_, err = detect_silence(Path('../_audio/GJ0u09SIPh4.mp3'))

source

parse_silence_ends

 parse_silence_ends (stderr_output:bytes)

Parse silence ends from ffmpeg stderr output

Exported source
def parse_silence_ends(stderr_output:bytes):
    "Parse silence ends from ffmpeg stderr output"
    # Each detected silence produces a 'silence_end: <seconds>' line on stderr.
    text = stderr_output.decode()
    return [float(m.group(1)) for m in re.finditer(r'silence_end: ([\d.]+)', text)]
ends = parse_silence_ends(err); L(ends)[:10]  # first 10 silence-end timestamps (seconds)
(#10) [0.513563,15.558687,26.482021,29.918437,32.245583,34.150583,35.980771,36.597167,39.411437,43.585812]

source

find_split_points

 find_split_points (silence_ends:list[float], total_len:float,
                    chunk_len:float=600)

Find points to split audio based on silence detection, aiming for chunks of chunk_len seconds

Type Default Details
silence_ends list silence ends
total_len float total length of the audio (in seconds)
chunk_len float 600 length of the chunks (in seconds)
Exported source
def find_split_points(
    silence_ends:list[float], # silence ends
    total_len:float, # total length of the audio (in seconds)
    chunk_len:float=600 # length of the chunks (in seconds)
    ):
    "Find points to split audio based on silence detection, aiming for chunks of `chunk_len` seconds"
    splits,target = [0],chunk_len
    for t in silence_ends:
        # take the first silence end at/after each chunk_len-sized target
        if t >= target:
            splits.append(t)
            target += chunk_len
    # close the final chunk; guard avoids a duplicate split point (and hence a
    # zero-length chunk downstream) when the last silence end equals total_len
    if splits[-1] < total_len:
        splits.append(total_len)
    return splits

source

get_audio_duration

 get_audio_duration (audio_file:pathlib.Path|str)

Get duration of audio file in seconds

Exported source
def get_audio_duration(audio_file:"Path|str"):
    "Get duration of audio file in seconds"
    # ffprobe reports duration as a string inside the 'format' section
    metadata = ffmpeg.probe(str(audio_file))
    duration_str = metadata['format']['duration']
    return float(duration_str)
tot_len = get_audio_duration(Path('../_audio/GJ0u09SIPh4.mp3')); tot_len  # total length in seconds
6995.976
soft_splits = find_split_points(ends, tot_len); soft_splits  # ~600s boundaries snapped to silences
[0,
 603.482062,
 1202.536562,
 1802.256479,
 2401.709521,
 3004.959437,
 3605.712229,
 4206.138958,
 4800.153625,
 5400.729625,
 6003.723708,
 6610.651771,
 6995.976]

source

get_mime_type

 get_mime_type (f)
Exported source
def get_mime_type(f):
    "Return 'audio/mpeg' for .mp3 files and 'audio/mp4' for everything else."
    suffix = Path(f).suffix.lower()
    if suffix == '.mp3':
        return 'audio/mpeg'
    return 'audio/mp4'

source

split_audio

 split_audio (fname:pathlib.Path, splits:list,
              dest_dir:str|pathlib.Path='_audio_chunks')

Split audio file into chunks based on split points

Type Default Details
fname Path Audio file to split
splits list List of timestamps in seconds to split at
dest_dir str | pathlib.Path _audio_chunks Directory to save chunks
Exported source
def split_audio(
    fname:"Path", # Audio file to split
    splits:"list", # List of timestamps in seconds to split at
    dest_dir:'str|Path'="_audio_chunks"): # Directory to save chunks
    "Split audio file into chunks based on split points"
    fname = Path(fname)  # normalize so a plain str works too (.stem is needed below)
    dest = Path(dest_dir)
    # parents=True so a nested destination (e.g. '../_audio/_audio_chunks') doesn't raise
    dest.mkdir(parents=True, exist_ok=True)
    chunks = []
    # pair consecutive split points: (start, end) per chunk
    for i, (start, end) in tqdm(enumerate(zip(splits, splits[1:])), total=len(splits)-1):
        output_path = dest/f"{fname.stem}_chunk_{i+1:02d}.mp3"
        chunks.append(output_path)
        # acodec='copy': cut without re-encoding, so splitting is fast and lossless
        (ffmpeg
         .input(str(fname), ss=start, t=end-start)
         .output(str(output_path), acodec='copy')
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True))
    return chunks
# Split the lecture into ~10-minute chunks at the detected silence points.
split_audio(Path('../_audio/GJ0u09SIPh4.mp3'), soft_splits, dest_dir='../_audio/_audio_chunks')
100%|██████████| 12/12 [00:02<00:00,  5.70it/s]
[Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_01.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_02.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_03.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_04.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_05.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_06.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_07.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_08.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_09.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_10.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_11.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_12.mp3')]

source

transcribe_audio

 transcribe_audio (chunks_dir:str|pathlib.Path,
                   dest_file:str|pathlib.Path,
                   model:str='gemini-2.0-flash-001', max_concurrent:int=3,
                   prompt:str='Please transcribe this audio file:')

Transcribe audio chunks in parallel and combine into single transcript

Type Default Details
chunks_dir str | pathlib.Path Directory containing audio chunks
dest_file str | pathlib.Path File to save transcript to
model str gemini-2.0-flash-001 Gemini model to use
max_concurrent int 3 Max concurrent transcriptions
prompt str Please transcribe this audio file: Custom prompt for transcription
Returns str
Exported source
async def transcribe_audio(
    chunks_dir:str|Path,  # Directory containing audio chunks
    dest_file:str|Path, # File to save transcript to
    model:str='gemini-2.0-flash-001', # Gemini model to use
    max_concurrent:int=3,   # Max concurrent transcriptions
    prompt:str="Please transcribe this audio file:" # Custom prompt for transcription
) -> str:
    "Transcribe audio chunks in parallel and combine into single transcript"
    semaphore = asyncio.Semaphore(max_concurrent)
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
    
    async def _transcribe_chunk(chunk_path):
        async with semaphore:
            audio_data = chunk_path.read_bytes()
            audio_part = types.Part.from_bytes(
                mime_type=get_mime_type(chunk_path), 
                data=audio_data
            )
            response = await client.aio.models.generate_content(
                model=model,
                contents=[prompt, audio_part]
            )
            return response.text
    
    chunks = sorted(Path(chunks_dir).glob("*.mp3"))
    tasks = [_transcribe_chunk(chunk) for chunk in chunks]
    transcripts = await asyncio.gather(*tasks)
    
    full_transcript = '\n'.join(transcripts)
    dest_path = Path(dest_file)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    dest_path.write_text(full_transcript)
    return full_transcript
# Transcribe all chunks concurrently (top-level await: notebook execution context).
transcript = await transcribe_audio(
    chunks_dir="../_audio/_audio_chunks", 
    dest_file="../_transcripts/transcript.txt",
    prompt="Please transcribe this audio file verbatim. Note that this is an academic course in French from College de France. The transcript should be in French."
)
2025-07-20 19:21:07,363 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:07,403 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:07,446 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:21,297 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:21,307 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:21,745 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:21,758 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:22,070 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:22,075 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:35,576 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:35,584 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:36,409 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:36,418 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:37,558 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:37,597 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:48,528 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:48,536 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:50,344 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:50,352 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:51,937 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:51,946 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:22:02,123 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:22:02,547 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:22:05,145 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
print(transcript[:1000])
Bien, mesdames messieurs. Merci d'être venus euh aussi nombreux. Cette affluence est impressionnante. Euh, je salue
toutes les personnes aussi qui vont nous suivre sur internet et peut-être sur France Culture, je ne sais pas 
encore. Et j'espère pouvoir, je vais utiliser des diapositives et pouvoir les mettre euh en ligne et les rendre 
disponibles en ligne sur le site du du Collège de France. Alors après la leçon inaugurale prononcée le 5 avril, 
donc nous entamons aujourd'hui les cours de l'année universitaire, de ce qui reste de l'année universitaire 
2017-2018, cette année étant déjà très avancée, les contraintes de salle et de calendrier étant ce qu'elles sont, 
il n'a pas été possible de programmer ces cours suivant un rythme hebdomadaire régulier et c'est l'année prochaine 
seulement que nous aurons droit à un créneau fixe à compter du mois de janvier euh sachant toutefois que le 
séminaire public de l'année prochaine qui va doubler le cours donc commencera en fait dès la fin novembre.
print(transcript[-1000:])
us les enfants, les les personnes qui vivent ou les adultes qui vivent avec leurs parents. Et l'exploitation des 
données du recensement se fait dans le cadre du ménage, et pas seulement dans un cadre individuel. Et donc, chaque 
fois qu'un euh une personne, adulte ou enfant, vit encore avec ses parents, on a le le renseignement pour ses 
parents, euh on a les les les fameuses variables en question pour ses parents. Donc, il y aurait pas une rupture 
radicale à introduire ces questions, mais euh toute une série d'associations euh ou de syndicats sont opposés à 
cette introduction et parle de de recensement des origines, de fichage des origines, et cetera. Je rappelle que le 
recensement est anonyme, hein, et que il est évidemment étroitement surveillé, mais je je conçois très bien que 
tout ceci puisse se discuter. Voilà, et nous avons abordé déjà au cours de ce cours euh bien des bien des choses. 
Je vous remercie de votre formidable attention et je vous donne rendez-vous euh à lundi pour euh