Fix and clean markdown headings, and enrich them with figure descriptions, …

This module aims to fix and enrich markdown headings from OCR’d PDF files by:

  1. Fixing heading hierarchy that was corrupted during OCR
  2. Adding page numbers to headings for better navigation
  3. Enriching figure references with descriptive text and creating a table of figures
Exported source
# Settings shared by the helpers in this module.
cfg = AttrDict({
    'enhanced_dir': 'enhanced',  # default subdirectory name used by setup_enhanced_dir
    'enriched_dir': 'enriched',  # presumably the subdirectory for enriched output; not used in the code shown here
    'lm': 'gemini/gemini-2.0-flash-exp',  # model identifier passed to dspy.LM
    'api_key': GEMINI_API_KEY,  # API key passed to dspy.LM
    'max_tokens': 8192,  # max_tokens passed to dspy.LM inside fix_md
    'track_usage': False,  # default for fix_md's track_usage and for dspy.settings.configure
    'img_dir': 'img'  # presumably the image directory name; not used in the code shown here
})

Fixing Markdown Headings

# doc = src_dir / 'abridged_evaluation_report_final_olta_ndoja_pdf'
doc = src_dir / 'final_evaluation_report_final_olta_ndoja_pdf'
# Sort on the integer after the underscore in the file stem (page_<n>.md) so that
# page_10 comes after page_9 instead of right after page_1 as a string sort would give.
pages = doc.ls(file_exts=".md").sorted(key=lambda p: int(p.stem.split('_')[1])); pages
(#142) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_17.md'),Path('../_da
ta/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_20.md')...]

source

setup_enhanced_dir

 setup_enhanced_dir (src_dir, enhanced_dir_name='enhanced')

Create enhanced directory and copy all markdown files to it

Type Default Details
src_dir Source directory path
enhanced_dir_name str enhanced Name of enhanced subdirectory
Exported source
def setup_enhanced_dir(
    src_dir, # Source directory path
    enhanced_dir_name=cfg.enhanced_dir # Name of enhanced subdirectory
    ):
    "Copy every markdown file found in `src_dir` into its `enhanced_dir_name` subdirectory and return that subdirectory"
    root = Path(src_dir)
    target = root / enhanced_dir_name
    # exist_ok=True lets the function be re-run on a directory that already exists
    target.mkdir(exist_ok=True)
    for md_file in root.ls(file_exts=".md"):
        shutil.copy(md_file, target)
    return target

For instance:

# Copy the document's markdown pages into an 'enhanced' subdirectory; the returned path is shown below
setup_enhanced_dir(doc, 'enhanced')
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced')

source

get_hdgs

 get_hdgs (md_txt)
Exported source
def get_hdgs(md_txt):
    "Return every line of `md_txt` that starts with one or more `#`, in document order"
    # MULTILINE makes ^ and $ anchor at each line, so each match is one whole line
    heading_line = re.compile(r'^#+.*$', re.MULTILINE)
    return heading_line.findall(md_txt)

source

get_hdgs_with_pages

 get_hdgs_with_pages (pages:list[pathlib.Path])

Get headings and the page number they are on

Type Details
pages list List of pages
Exported source
def get_hdgs_with_pages(
    pages: list[Path] # List of pages
    ):
    "Return one `{'heading', 'page'}` dict per heading, where `page` is the 1-based index of the file in `pages`"
    return [
        {'heading': hdg, 'page': page_no}
        for page_no, page_path in enumerate(pages, start=1)  # page numbers start at 1
        for hdg in get_hdgs(page_path.read_text())
    ]
# Collect every heading of the document together with its page number; show the first five
hdgs = get_hdgs_with_pages(pages); hdgs[:5]
[{'heading': '# **PPMi**', 'page': 1},
 {'heading': '# LIST OF FIGURES ', 'page': 5},
 {'heading': '# Abbreviations and terminology ', 'page': 6},
 {'heading': '# Key terminology ', 'page': 8},
 {'heading': '# Executive summary ', 'page': 10}]
# Flat list of all heading lines across pages (no page numbers); show the first five
toc = L([get_hdgs(p.read_text()) for p in pages]).concat(); toc[:5]
(#5) ['# **PPMi**','# LIST OF FIGURES ','# Abbreviations and terminology ','# Key terminology ','# Executive summary ']

source

format_hdgs

 format_hdgs (hdgs:list[dict])

Format headings with page numbers

Type Details
hdgs list List of headings with page numbers
Exported source
def format_hdgs(
    hdgs: list[dict] # List of headings with page numbers
    ):
    "Render one line per heading as `<heading> (Page <p>, Position <k>)`, where `k` counts headings within page `p` from 1"
    seen_on_page = {}  # page number -> how many headings of that page were emitted so far
    lines = []
    for entry in hdgs:
        pg = entry['page']
        pos = seen_on_page.get(pg, 0) + 1
        seen_on_page[pg] = pos
        lines.append(f"{entry['heading']} (Page {pg}, Position {pos})")
    return "\n".join(lines)
# Preview the first 500 characters of the formatted heading list
print(format_hdgs(hdgs)[:500])
# **PPMi** (Page 1, Position 1)
# LIST OF FIGURES  (Page 5, Position 1)
# Abbreviations and terminology  (Page 6, Position 1)
# Key terminology  (Page 8, Position 1)
# Executive summary  (Page 10, Position 1)
## Background (Page 10, Position 2)
# Methodology  (Page 11, Position 1)
# Findings  (Page 12, Position 1)
## Relevance (Page 12, Position 2)
# Coherence  (Page 13, Position 1)
## $4.3 / 5$ (Page 13, Position 2)
# Effectiveness  (Page 14, Position 1)
## Specific Outcome 1: (Page 14, Positio

source

HeadingResult

 HeadingResult (old:str, page:int, position:int, new:str, changed:bool)

*!!! abstract “Usage Documentation” Models

A base class for creating Pydantic models.

Attributes: class_vars: The names of the class variables defined on the model. private_attributes: Metadata about the private attributes of the model. signature: The synthesized __init__ [Signature][inspect.Signature] of the model.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.*
Exported source
# Module-level DSPy setup using the model name and API key from cfg.
# NOTE(review): unlike fix_md, no max_tokens is passed to dspy.LM here.
lm = dspy.LM(cfg.lm, api_key=cfg.api_key)
dspy.configure(lm=lm)
dspy.settings.configure(track_usage=cfg.track_usage)
Exported source
# Element type of FixHeadingHierarchy's `results` output: one entry per heading.
class HeadingResult(BaseModel):
    old: str  # heading as it appeared in the input
    page: int  # page number the heading is on
    position: int  # position of the heading within its page (format_hdgs numbers these from 1)
    new: str  # heading after correction
    changed: bool  # True if correction was made

source

FixHeadingHierarchy

 FixHeadingHierarchy (headings_with_pages:str,
                      results:List[__main__.HeadingResult])

Fix markdown heading hierarchy by analyzing the document’s numbering patterns:

- Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.)
- Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level
- When a section number is lower than a previously seen number at the same level (e.g., seeing ‘2.’ after ‘3.1’), it’s likely a subsection or list item, not a main section
- Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections
- Return ALL headings with their corrected form

Exported source
# DSPy signature used by fix_md. The class docstring is the task description handed to
# the LM, so change its wording only when a prompt change is intended.
class FixHeadingHierarchy(dspy.Signature):
    """Fix markdown heading hierarchy by analyzing the document's numbering patterns:
    - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.)
    - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level
    - When a section number is lower than a previously seen number at the same level (e.g., seeing '2.' after '3.1'), it's likely a subsection or list item, not a main section
    - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections
    - Return ALL headings with their corrected form
    """
    
    # Input: the heading list as a single string (fix_md passes the output of format_hdgs)
    headings_with_pages: str = dspy.InputField(desc="List of headings with page numbers")
    # Output: a list of HeadingResult entries
    results: List[HeadingResult] = dspy.OutputField(desc="All headings with corrections and change status")

source

fix_md

 fix_md (hdgs:list[dict], track_usage:bool=False)

Fix markdown headings

Type Default Details
hdgs list List of headings with page numbers
track_usage bool False
Exported source
def fix_md(
    hdgs: list[dict], # List of headings with page numbers
    track_usage: bool=cfg.track_usage,
    ):
    "Send the formatted heading list through `FixHeadingHierarchy` and return the resulting DSPy prediction"
    # NOTE: both configure calls below update DSPy's settings, not just this call
    dspy.configure(lm=dspy.LM(cfg.lm, api_key=cfg.api_key, max_tokens=cfg.max_tokens))
    dspy.settings.configure(track_usage=track_usage)

    prompt = format_hdgs(hdgs)
    predictor = dspy.ChainOfThought(FixHeadingHierarchy)
    return predictor(headings_with_pages=prompt)
# Run the correction with usage tracking on, then print the prediction and the LM usage
result = fix_md(hdgs, track_usage=True)
print("Result:", result)
print("Usage:", result.get_lm_usage())
Result: Prediction(
    reasoning='The document uses a combination of numbered and unnumbered headings. The numbered headings follow a 
hierarchical structure (e.g., 2, 2.1, 2.1.1). I will use this numbering to determine the appropriate heading level.
Unnumbered headings will be assigned a level based on their context within the numbered headings. Headings like 
"LIST OF FIGURES", "Abbreviations and terminology", "Key terminology", and "Executive summary" at the beginning 
will be treated as top-level headings. "Part" headings will also be top-level headings.',
    results=[HeadingResult(old='# **PPMi**', page=1, position=1, new='# **PPMi**', changed=False), 
HeadingResult(old='# LIST OF FIGURES', page=5, position=1, new='# LIST OF FIGURES', changed=False), 
HeadingResult(old='# Abbreviations and terminology', page=6, position=1, new='# Abbreviations and terminology', 
changed=False), HeadingResult(old='# Key terminology', page=8, position=1, new='# Key terminology', changed=False),
HeadingResult(old='# Executive summary', page=10, position=1, new='# Executive summary', changed=False), 
HeadingResult(old='## Background', page=10, position=2, new='## Background', changed=False), HeadingResult(old='# 
Methodology', page=11, position=1, new='# Methodology', changed=False), HeadingResult(old='# Findings', page=12, 
position=1, new='# Findings', changed=False), HeadingResult(old='## Relevance', page=12, position=2, new='## 
Relevance', changed=False), HeadingResult(old='# Coherence', page=13, position=1, new='# Coherence', 
changed=False), HeadingResult(old='## $4.3 / 5$', page=13, position=2, new='## $4.3 / 5$', changed=True), 
HeadingResult(old='# Effectiveness', page=14, position=1, new='# Effectiveness', changed=False), 
HeadingResult(old='## Specific Outcome 1:', page=14, position=2, new='## Specific Outcome 1:', changed=False), 
HeadingResult(old='## Specific Outcome 2:', page=14, position=3, new='## Specific Outcome 2:', changed=False), 
HeadingResult(old='# Specific Outcome 3:', page=15, position=1, new='# Specific Outcome 3:', changed=False), 
HeadingResult(old='## Efficiency', page=15, position=2, new='## Efficiency', changed=False), HeadingResult(old='# 
Sustainability', page=16, position=1, new='# Sustainability', changed=False), HeadingResult(old='## Conclusions and
recommendations', page=16, position=2, new='## Conclusions and recommendations', changed=False), 
HeadingResult(old='# 1. Introduction', page=18, position=1, new='# 1. Introduction', changed=False), 
HeadingResult(old='# Part 1: Background and methodology', page=19, position=1, new='# Part 1: Background and 
methodology', changed=False), HeadingResult(old='# 2. Background to the JI-HoA', page=20, position=1, new='# 2. 
Background to the JI-HoA', changed=False), HeadingResult(old='### 2.1. Context and design of the JI-HoA', page=20, 
position=2, new='## 2.1. Context and design of the JI-HoA', changed=False), HeadingResult(old='# 2.2. External 
factors affecting the implementation of the JI-HoA', page=23, position=1, new='# 2.2. External factors affecting 
the implementation of the JI-HoA', changed=False), HeadingResult(old='# 3. Methodology of the evaluation', page=26,
position=1, new='# 3. Methodology of the evaluation', changed=False), HeadingResult(old='### 3.1. Evaluation 
framework', page=26, position=2, new='## 3.1. Evaluation framework', changed=False), HeadingResult(old='# TABLE 2. 
INTERVENTION LOGIC', page=27, position=1, new='# TABLE 2. INTERVENTION LOGIC', changed=False), HeadingResult(old='#
3.2. Evaluation matrix', page=29, position=1, new='# 3.2. Evaluation matrix', changed=False), 
HeadingResult(old='### 3.3. Data collection', page=29, position=2, new='## 3.3. Data collection', changed=False), 
HeadingResult(old='# 3.4. Scoring system', page=30, position=1, new='# 3.4. Scoring system', changed=False), 
HeadingResult(old='# 3.5. Limitations', page=32, position=1, new='# 3.5. Limitations', changed=False), 
HeadingResult(old='# Part 2: Findings', page=33, position=1, new='# Part 2: Findings', changed=False), 
HeadingResult(old='# 4. Relevance', page=34, position=1, new='# 4. Relevance', changed=False), 
HeadingResult(old='## Overall performance score for relevance: 3.9/5.', page=34, position=2, new='## Overall 
performance score for relevance: 3.9/5.', changed=False), HeadingResult(old='## Robustness score for the evidence: 
4.5/5.', page=34, position=3, new='## Robustness score for the evidence: 4.5/5.', changed=False), 
HeadingResult(old='### 4.1. Relevance of programme activities for migrants, returnees, and communities', page=34, 
position=4, new='### 4.1. Relevance of programme activities for migrants, returnees, and communities', 
changed=False), HeadingResult(old='### 4.1.1. Needs of migrants', page=34, position=5, new='#### 4.1.1. Needs of 
migrants', changed=False), HeadingResult(old='# 4.1.2. Needs of returnees', page=35, position=1, new='# 4.1.2. 
Needs of returnees', changed=False), HeadingResult(old='# 4.1.3. Needs of community members', page=38, position=1, 
new='# 4.1.3. Needs of community members', changed=False), HeadingResult(old="# 4.2. Programme's relevance to the 
needs of stakeholders", page=39, position=1, new="# 4.2. Programme's relevance to the needs of stakeholders", 
changed=False), HeadingResult(old='### 4.2.1. Needs of governments', page=39, position=2, new='### 4.2.1. Needs of 
governments', changed=False), HeadingResult(old='# 4.2.2. Needs of other stakeholders', page=40, position=1, new='#
4.2.2. Needs of other stakeholders', changed=False), HeadingResult(old='# 4.3. Involvement of stakeholders in the 
design, implementation and monitoring of the programme', page=41, position=1, new='# 4.3. Involvement of 
stakeholders in the design, implementation and monitoring of the programme', changed=False), HeadingResult(old='# 
4.4. Horizontal priorities', page=42, position=1, new='# 4.4. Horizontal priorities', changed=False), 
HeadingResult(old='### 4.4.1. Gender equality', page=42, position=2, new='### 4.4.1. Gender equality', 
changed=False), HeadingResult(old='# 4.4.2. Persons with disabilities', page=43, position=1, new='# 4.4.2. Persons 
with disabilities', changed=False), HeadingResult(old='### 4.4.3. Protection', page=43, position=2, new='### 4.4.3.
Protection', changed=False), HeadingResult(old='# 4.4.4. Environmental sustainability', page=44, position=1, new='#
4.4.4. Environmental sustainability', changed=False), HeadingResult(old='# 5. Coherence', page=46, position=1, 
new='# 5. Coherence', changed=False), HeadingResult(old='## Overall performance score for coherence: $4.3 / 5$.', 
page=46, position=2, new='## Overall performance score for coherence: $4.3 / 5$.', changed=False), 
HeadingResult(old='## Robustness score for the evidence: $4 / 5$.', page=46, position=3, new='## Robustness score 
for the evidence: $4 / 5$.', changed=False), HeadingResult(old="### 5.1. The JI-HoA's alignment with the objectives
and standards of IOM, and the objectives of the EU", page=46, position=4, new="### 5.1. The JI-HoA's alignment with
the objectives and standards of IOM, and the objectives of the EU", changed=False), HeadingResult(old='### 5.1.1. 
Objectives of the IOM', page=46, position=5, new='#### 5.1.1. Objectives of the IOM', changed=False), 
HeadingResult(old='# 5.1.2. Objectives of the EU', page=48, position=1, new='# 5.1.2. Objectives of the EU', 
changed=False), HeadingResult(old='# 5.1.3. Government initiatives', page=49, position=1, new='# 5.1.3. Government 
initiatives', changed=False), HeadingResult(old='# 5.2. Alignment with other initiatives', page=50, position=1, 
new='# 5.2. Alignment with other initiatives', changed=False), HeadingResult(old='### 5.2.1. Initiatives of 
regional and continental institutions', page=50, position=2, new='### 5.2.1. Initiatives of regional and 
continental institutions', changed=False), HeadingResult(old='# 5.2.2. Initiatives by other (UN) organisations', 
page=51, position=1, new='# 5.2.2. Initiatives by other (UN) organisations', changed=False), HeadingResult(old='# 
6. Effectiveness and impact', page=54, position=1, new='# 6. Effectiveness and impact', changed=False), 
HeadingResult(old='## Overall performance score for effectiveness: 3.8/5.', page=54, position=2, new='## Overall 
performance score for effectiveness: 3.8/5.', changed=False), HeadingResult(old="## Overall score on IOM's 
achievements: 3.4/5.", page=54, position=3, new="## Overall score on IOM's achievements: 3.4/5.", changed=False), 
HeadingResult(old='## Robustness score for the evidence: $4 / 5$.', page=54, position=4, new='## Robustness score 
for the evidence: $4 / 5$.', changed=False), HeadingResult(old="### 6.1. Design and achievement of targets for the 
programme's indicators", page=54, position=5, new="### 6.1. Design and achievement of targets for the programme's 
indicators", changed=False), HeadingResult(old='# 6.2. Specific Objective 1: partner countries and relevant 
stakeholders developed or strengthened evidence-based return and reintegration procedures', page=57, position=1, 
new='# 6.2. Specific Objective 1: partner countries and relevant stakeholders developed or strengthened 
evidence-based return and reintegration procedures', changed=False), HeadingResult(old='### 6.2.1. Achievement of 
outputs and results', page=57, position=2, new='### 6.2.1. Achievement of outputs and results', changed=False), 
HeadingResult(old='## Data availability', page=57, position=3, new='#### Data availability', changed=True), 
HeadingResult(old='# Capacity of stakeholders', page=58, position=1, new='### Capacity of stakeholders', 
changed=True), HeadingResult(old='# Capacity of the African Union Commission', page=61, position=1, new='### 
Capacity of the African Union Commission', changed=True), HeadingResult(old='# 6.2.2. Achievement of Specific 
Objective 1', page=62, position=1, new='# 6.2.2. Achievement of Specific Objective 1', changed=False), 
HeadingResult(old='# 6.3. Specific Objective 2: safe, humane, dignified voluntary return processes are enhanced 
along main migration routes', page=64, position=1, new='# 6.3. Specific Objective 2: safe, humane, dignified 
voluntary return processes are enhanced along main migration routes', changed=False), HeadingResult(old='### 6.3.1.
Achievement of outputs and results', page=64, position=2, new='### 6.3.1. Achievement of outputs and results', 
changed=False), HeadingResult(old='## Outreach and awareness', page=64, position=3, new='#### Outreach and 
awareness', changed=True), HeadingResult(old='# Assistance to stranded migrants', page=66, position=1, new='### 
Assistance to stranded migrants', changed=True), HeadingResult(old='# 6.3.2. Achievement of Specific Objective 2', 
page=67, position=1, new='# 6.3.2. Achievement of Specific Objective 2', changed=False), HeadingResult(old='## 6.4.
Specific Objective 3: returnees are sustainably integrated in host communities, and host communities are better 
able to create living standards that address drivers of migration.', page=67, position=2, new='## 6.4. Specific 
Objective 3: returnees are sustainably integrated in host communities, and host communities are better able to 
create living standards that address drivers of migration.', changed=False), HeadingResult(old='# 6.4.1. 
Achievement of outputs and results', page=68, position=1, new='# 6.4.1. Achievement of outputs and results', 
changed=True), HeadingResult(old='## Individual and community-based reintegration', page=68, position=2, new='## 
Individual and community-based reintegration', changed=True), HeadingResult(old='# M\\&E systems', page=70, 
position=1, new='### M&E systems', changed=True), HeadingResult(old='# 6.4.2. Achievement of Specific Objective 3',
page=71, position=1, new='# 6.4.2. Achievement of Specific Objective 3', changed=False), HeadingResult(old='## 
Overall achievement of reintegration', page=71, position=2, new='## Overall achievement of reintegration', 
changed=True), HeadingResult(old='# 6.5. Assessing vulnerabilities', page=77, position=1, new='### 6.5. Assessing 
vulnerabilities', changed=True), HeadingResult(old='### 6.5.1. Achievements and challenges in screening migrant 
vulnerabilities and assessing eligibility for support', page=77, position=2, new='#### 6.5.1. Achievements and 
challenges in screening migrant vulnerabilities and assessing eligibility for support', changed=False), 
HeadingResult(old='# 6.5.2. Contact and communication with beneficiaries', page=79, position=1, new='# 6.5.2. 
Contact and communication with beneficiaries', changed=False), HeadingResult(old='# 6.6. Functioning of the 
integrated approach', page=81, position=1, new='# 6.6. Functioning of the integrated approach', changed=False), 
HeadingResult(old='# 7. Efficiency', page=83, position=1, new='# 7. Efficiency', changed=False), 
HeadingResult(old='## Overall performance score for efficiency: 4.3/5.', page=83, position=2, new='## Overall 
performance score for efficiency: 4.3/5.', changed=False), HeadingResult(old='## Robustness score for the evidence:
3.5/5.', page=83, position=3, new='## Robustness score for the evidence: 3.5/5.', changed=False), 
HeadingResult(old='### 7.1. Did the programme receive sufficient resources to achieve its objectives?', page=83, 
position=4, new='### 7.1. Did the programme receive sufficient resources to achieve its objectives?', 
changed=False), HeadingResult(old="### 7.1.1. To what extent were financial resources sufficient to meet the 
programme's objectives?", page=83, position=5, new="#### 7.1.1. To what extent were financial resources sufficient 
to meet the programme's objectives?", changed=False), HeadingResult(old='# 7.1.2. To what extent was the "top-up" 
funding system efficient for planning and budgeting?', page=86, position=1, new='# 7.1.2. To what extent was the 
"top-up" funding system efficient for planning and budgeting?', changed=False), HeadingResult(old="# 7.1.3. To what
extent were human resources sufficient to meet the programme's objectives?", page=87, position=1, new="# 7.1.3. To 
what extent were human resources sufficient to meet the programme's objectives?", changed=False), 
HeadingResult(old='# 7.1.4. To what extent were the programme activities implemented according to the initial 
timeline?', page=88, position=1, new='# 7.1.4. To what extent were the programme activities implemented according 
to the initial timeline?', changed=False), HeadingResult(old='# 7.2. Cost-effectiveness and efficiency of the 
programme', page=89, position=1, new='# 7.2. Cost-effectiveness and efficiency of the programme', changed=False), 
HeadingResult(old='### 7.2.1. How well were the resources (funds, expertise and time) converted into results?', 
page=89, position=2, new='### 7.2.1. How well were the resources (funds, expertise and time) converted into 
results?', changed=False), HeadingResult(old='# Increased efficiency through partnerships and capacity building', 
page=90, position=1, new='### Increased efficiency through partnerships and capacity building', changed=True), 
HeadingResult(old='# 7.2.2. Could the programme have been implemented in a more cost-effective manner? If so, 
how?', page=91, position=1, new='# 7.2.2. Could the programme have been implemented in a more cost-effective 
manner? If so, how?', changed=False), HeadingResult(old='## Improved efficiency of some activities', page=91, 
position=2, new='## Improved efficiency of some activities', changed=True), HeadingResult(old='# Decreased 
efficiency of some activities', page=92, position=1, new='### Decreased efficiency of some activities', 
changed=True), HeadingResult(old='# Examples of cost reduction', page=93, position=1, new='### Examples of cost 
reduction', changed=True), HeadingResult(old='# Lessons learned in relation to cost-effectiveness', page=94, 
position=1, new='### Lessons learned in relation to cost-effectiveness', changed=True), HeadingResult(old='### 
7.2.3. To what extent did the programme make efficiency gains by relying on existing services?', page=94, 
position=2, new='### 7.2.3. To what extent did the programme make efficiency gains by relying on existing 
services?', changed=False), HeadingResult(old='# 7.2.4. To what extent did the national referral mechanisms 
function effectively enough to support the JI-HoA?', page=97, position=1, new='# 7.2.4. To what extent did the 
national referral mechanisms function effectively enough to support the JI-HoA?', changed=False), 
HeadingResult(old='# 8. Sustainability', page=100, position=1, new='# 8. Sustainability', changed=False), 
HeadingResult(old='## Overall performance score for sustainability: 2.5/5.', page=100, position=2, new='## Overall 
performance score for sustainability: 2.5/5.', changed=False), HeadingResult(old='## Robustness score for the 
evidence: 4.5/5.', page=100, position=3, new='## Robustness score for the evidence: 4.5/5.', changed=False), 
HeadingResult(old='### 8.1. Main achievements in terms of the technical, managerial and financial capacity of 
governments and other stakeholders to continue working on return and reintegration', page=100, position=4, new='###
8.1. Main achievements in terms of the technical, managerial and financial capacity of governments and other 
stakeholders to continue working on return and reintegration', changed=False), HeadingResult(old='# 8.2. Main 
challenges in terms of the technical, managerial, and financial capacity of governments and other stakeholders to 
continue working on return and reintegration', page=102, position=1, new='# 8.2. Main challenges in terms of the 
technical, managerial, and financial capacity of governments and other stakeholders to continue working on return 
and reintegration', changed=False), HeadingResult(old='# Part 3: Conclusions and Recommendations', page=104, 
position=1, new='# Part 3: Conclusions and Recommendations', changed=False), HeadingResult(old='# 9. Conclusions', 
page=105, position=1, new='# 9. Conclusions', changed=False), HeadingResult(old='# 10. Recommendations', page=107, 
position=1, new='# 10. Recommendations', changed=False), HeadingResult(old='## 1. Enhance efforts with national, 
regional and local stakeholders to build capacity and ownership (while continuing the provision of funding).', 
page=107, position=2, new='## 1. Enhance efforts with national, regional and local stakeholders to build capacity 
and ownership (while continuing the provision of funding).', changed=True), HeadingResult(old='# 3. Increase 
attention on building partnerships with service providers who can function without (significant) funding channelled
by IOM.', page=108, position=1, new='# 3. Increase attention on building partnerships with service providers who 
can function without (significant) funding channelled by IOM.', changed=True), HeadingResult(old='# 5. Explore 
opportunities to extend the scope of support provided to returnees, with a focus on longer-term reintegration.', 
page=109, position=1, new='# 5. Explore opportunities to extend the scope of support provided to returnees, with a 
focus on longer-term reintegration.', changed=True), HeadingResult(old='# ANNEXES', page=111, position=1, new='# 
ANNEXES', changed=False), HeadingResult(old='# Annex 1. Evaluation framework', page=112, position=1, new='# Annex 
1. Evaluation framework', changed=False), HeadingResult(old='# TABLE 13. EVALUATION QUESTIONS FOR THE 
SUSTAINABILITY CRITERION', page=122, position=1, new='# TABLE 13. EVALUATION QUESTIONS FOR THE SUSTAINABILITY 
CRITERION', changed=False), HeadingResult(old='# Annex 2. Indicators, targets and achievements', page=125, 
position=1, new='# Annex 2. Indicators, targets and achievements', changed=False), HeadingResult(old='# Annex 3. 
IOM performance scores and methodology', page=131, position=1, new='# Annex 3. IOM performance scores and 
methodology', changed=False), HeadingResult(old='# Relevance', page=132, position=1, new='# Relevance', 
changed=False), HeadingResult(old='# Coherence', page=133, position=1, new='# Coherence', changed=False), 
HeadingResult(old='## Effectiveness', page=133, position=2, new='## Effectiveness', changed=False), 
HeadingResult(old='# Efficiency', page=136, position=1, new='# Efficiency', changed=False), HeadingResult(old='## 
Sustainability', page=136, position=2, new='## Sustainability', changed=False), HeadingResult(old='# Annex 4. 
Assessment of indicators', page=137, position=1, new='# Annex 4. Assessment of indicators', changed=False), 
HeadingResult(old='# Annex 5. Members of the PSCs', page=142, position=1, new='# Annex 5. Members of the PSCs', 
changed=False)]
)
Usage:
{
    'gemini/gemini-2.0-flash-exp': {
        'completion_tokens': 8120,
        'prompt_tokens': 3360,
        'total_tokens': 11480,
        'completion_tokens_details': None,
        'prompt_tokens_details': {
            'audio_tokens': None,
            'cached_tokens': None,
            'text_tokens': 3360,
            'image_tokens': None
        }
    }
}

source

group_corrections_by_page

 group_corrections_by_page (results:list[__main__.HeadingResult])

Group HeadingResult corrections by page number into dict with page nums as keys

Type Details
results list List of headings with corrections and change status
Exported source
def group_corrections_by_page(
    results: list[HeadingResult], # List of headings with corrections and change status
    ):
    "Bucket HeadingResult corrections under their page number, keeping input order within each page"
    by_page = {}
    for hdg in results:
        # First heading seen for a page creates its bucket; later ones are appended to it
        by_page.setdefault(hdg.page, []).append(hdg)
    return by_page
# Demo: display the corrections bucketed by page number
group_corrections_by_page(result.results)
{1: [HeadingResult(old='# **PPMi**', page=1, position=1, new='# **PPMi**', changed=False)],
 5: [HeadingResult(old='# LIST OF FIGURES', page=5, position=1, new='# LIST OF FIGURES', changed=False)],
 6: [HeadingResult(old='# Abbreviations and terminology', page=6, position=1, new='# Abbreviations and terminology', changed=False)],
 8: [HeadingResult(old='# Key terminology', page=8, position=1, new='# Key terminology', changed=False)],
 10: [HeadingResult(old='# Executive summary', page=10, position=1, new='# Executive summary', changed=False),
  HeadingResult(old='## Background', page=10, position=2, new='## Background', changed=False)],
 11: [HeadingResult(old='# Methodology', page=11, position=1, new='# Methodology', changed=False)],
 12: [HeadingResult(old='# Findings', page=12, position=1, new='# Findings', changed=False),
  HeadingResult(old='## Relevance', page=12, position=2, new='## Relevance', changed=False)],
 13: [HeadingResult(old='# Coherence', page=13, position=1, new='# Coherence', changed=False),
  HeadingResult(old='## $4.3 / 5$', page=13, position=2, new='## $4.3 / 5$', changed=True)],
 14: [HeadingResult(old='# Effectiveness', page=14, position=1, new='# Effectiveness', changed=False),
  HeadingResult(old='## Specific Outcome 1:', page=14, position=2, new='## Specific Outcome 1:', changed=False),
  HeadingResult(old='## Specific Outcome 2:', page=14, position=3, new='## Specific Outcome 2:', changed=False)],
 15: [HeadingResult(old='# Specific Outcome 3:', page=15, position=1, new='# Specific Outcome 3:', changed=False),
  HeadingResult(old='## Efficiency', page=15, position=2, new='## Efficiency', changed=False)],
 16: [HeadingResult(old='# Sustainability', page=16, position=1, new='# Sustainability', changed=False),
  HeadingResult(old='## Conclusions and recommendations', page=16, position=2, new='## Conclusions and recommendations', changed=False)],
 18: [HeadingResult(old='# 1. Introduction', page=18, position=1, new='# 1. Introduction', changed=False)],
 19: [HeadingResult(old='# Part 1: Background and methodology', page=19, position=1, new='# Part 1: Background and methodology', changed=False)],
 20: [HeadingResult(old='# 2. Background to the JI-HoA', page=20, position=1, new='# 2. Background to the JI-HoA', changed=False),
  HeadingResult(old='### 2.1. Context and design of the JI-HoA', page=20, position=2, new='## 2.1. Context and design of the JI-HoA', changed=False)],
 23: [HeadingResult(old='# 2.2. External factors affecting the implementation of the JI-HoA', page=23, position=1, new='# 2.2. External factors affecting the implementation of the JI-HoA', changed=False)],
 26: [HeadingResult(old='# 3. Methodology of the evaluation', page=26, position=1, new='# 3. Methodology of the evaluation', changed=False),
  HeadingResult(old='### 3.1. Evaluation framework', page=26, position=2, new='## 3.1. Evaluation framework', changed=False)],
 27: [HeadingResult(old='# TABLE 2. INTERVENTION LOGIC', page=27, position=1, new='# TABLE 2. INTERVENTION LOGIC', changed=False)],
 29: [HeadingResult(old='# 3.2. Evaluation matrix', page=29, position=1, new='# 3.2. Evaluation matrix', changed=False),
  HeadingResult(old='### 3.3. Data collection', page=29, position=2, new='## 3.3. Data collection', changed=False)],
 30: [HeadingResult(old='# 3.4. Scoring system', page=30, position=1, new='# 3.4. Scoring system', changed=False)],
 32: [HeadingResult(old='# 3.5. Limitations', page=32, position=1, new='# 3.5. Limitations', changed=False)],
 33: [HeadingResult(old='# Part 2: Findings', page=33, position=1, new='# Part 2: Findings', changed=False)],
 34: [HeadingResult(old='# 4. Relevance', page=34, position=1, new='# 4. Relevance', changed=False),
  HeadingResult(old='## Overall performance score for relevance: 3.9/5.', page=34, position=2, new='## Overall performance score for relevance: 3.9/5.', changed=False),
  HeadingResult(old='## Robustness score for the evidence: 4.5/5.', page=34, position=3, new='## Robustness score for the evidence: 4.5/5.', changed=False),
  HeadingResult(old='### 4.1. Relevance of programme activities for migrants, returnees, and communities', page=34, position=4, new='### 4.1. Relevance of programme activities for migrants, returnees, and communities', changed=False),
  HeadingResult(old='### 4.1.1. Needs of migrants', page=34, position=5, new='#### 4.1.1. Needs of migrants', changed=False)],
 35: [HeadingResult(old='# 4.1.2. Needs of returnees', page=35, position=1, new='# 4.1.2. Needs of returnees', changed=False)],
 38: [HeadingResult(old='# 4.1.3. Needs of community members', page=38, position=1, new='# 4.1.3. Needs of community members', changed=False)],
 39: [HeadingResult(old="# 4.2. Programme's relevance to the needs of stakeholders", page=39, position=1, new="# 4.2. Programme's relevance to the needs of stakeholders", changed=False),
  HeadingResult(old='### 4.2.1. Needs of governments', page=39, position=2, new='### 4.2.1. Needs of governments', changed=False)],
 40: [HeadingResult(old='# 4.2.2. Needs of other stakeholders', page=40, position=1, new='# 4.2.2. Needs of other stakeholders', changed=False)],
 41: [HeadingResult(old='# 4.3. Involvement of stakeholders in the design, implementation and monitoring of the programme', page=41, position=1, new='# 4.3. Involvement of stakeholders in the design, implementation and monitoring of the programme', changed=False)],
 42: [HeadingResult(old='# 4.4. Horizontal priorities', page=42, position=1, new='# 4.4. Horizontal priorities', changed=False),
  HeadingResult(old='### 4.4.1. Gender equality', page=42, position=2, new='### 4.4.1. Gender equality', changed=False)],
 43: [HeadingResult(old='# 4.4.2. Persons with disabilities', page=43, position=1, new='# 4.4.2. Persons with disabilities', changed=False),
  HeadingResult(old='### 4.4.3. Protection', page=43, position=2, new='### 4.4.3. Protection', changed=False)],
 44: [HeadingResult(old='# 4.4.4. Environmental sustainability', page=44, position=1, new='# 4.4.4. Environmental sustainability', changed=False)],
 46: [HeadingResult(old='# 5. Coherence', page=46, position=1, new='# 5. Coherence', changed=False),
  HeadingResult(old='## Overall performance score for coherence: $4.3 / 5$.', page=46, position=2, new='## Overall performance score for coherence: $4.3 / 5$.', changed=False),
  HeadingResult(old='## Robustness score for the evidence: $4 / 5$.', page=46, position=3, new='## Robustness score for the evidence: $4 / 5$.', changed=False),
  HeadingResult(old="### 5.1. The JI-HoA's alignment with the objectives and standards of IOM, and the objectives of the EU", page=46, position=4, new="### 5.1. The JI-HoA's alignment with the objectives and standards of IOM, and the objectives of the EU", changed=False),
  HeadingResult(old='### 5.1.1. Objectives of the IOM', page=46, position=5, new='#### 5.1.1. Objectives of the IOM', changed=False)],
 48: [HeadingResult(old='# 5.1.2. Objectives of the EU', page=48, position=1, new='# 5.1.2. Objectives of the EU', changed=False)],
 49: [HeadingResult(old='# 5.1.3. Government initiatives', page=49, position=1, new='# 5.1.3. Government initiatives', changed=False)],
 50: [HeadingResult(old='# 5.2. Alignment with other initiatives', page=50, position=1, new='# 5.2. Alignment with other initiatives', changed=False),
  HeadingResult(old='### 5.2.1. Initiatives of regional and continental institutions', page=50, position=2, new='### 5.2.1. Initiatives of regional and continental institutions', changed=False)],
 51: [HeadingResult(old='# 5.2.2. Initiatives by other (UN) organisations', page=51, position=1, new='# 5.2.2. Initiatives by other (UN) organisations', changed=False)],
 54: [HeadingResult(old='# 6. Effectiveness and impact', page=54, position=1, new='# 6. Effectiveness and impact', changed=False),
  HeadingResult(old='## Overall performance score for effectiveness: 3.8/5.', page=54, position=2, new='## Overall performance score for effectiveness: 3.8/5.', changed=False),
  HeadingResult(old="## Overall score on IOM's achievements: 3.4/5.", page=54, position=3, new="## Overall score on IOM's achievements: 3.4/5.", changed=False),
  HeadingResult(old='## Robustness score for the evidence: $4 / 5$.', page=54, position=4, new='## Robustness score for the evidence: $4 / 5$.', changed=False),
  HeadingResult(old="### 6.1. Design and achievement of targets for the programme's indicators", page=54, position=5, new="### 6.1. Design and achievement of targets for the programme's indicators", changed=False)],
 57: [HeadingResult(old='# 6.2. Specific Objective 1: partner countries and relevant stakeholders developed or strengthened evidence-based return and reintegration procedures', page=57, position=1, new='# 6.2. Specific Objective 1: partner countries and relevant stakeholders developed or strengthened evidence-based return and reintegration procedures', changed=False),
  HeadingResult(old='### 6.2.1. Achievement of outputs and results', page=57, position=2, new='### 6.2.1. Achievement of outputs and results', changed=False),
  HeadingResult(old='## Data availability', page=57, position=3, new='#### Data availability', changed=True)],
 58: [HeadingResult(old='# Capacity of stakeholders', page=58, position=1, new='### Capacity of stakeholders', changed=True)],
 61: [HeadingResult(old='# Capacity of the African Union Commission', page=61, position=1, new='### Capacity of the African Union Commission', changed=True)],
 62: [HeadingResult(old='# 6.2.2. Achievement of Specific Objective 1', page=62, position=1, new='# 6.2.2. Achievement of Specific Objective 1', changed=False)],
 64: [HeadingResult(old='# 6.3. Specific Objective 2: safe, humane, dignified voluntary return processes are enhanced along main migration routes', page=64, position=1, new='# 6.3. Specific Objective 2: safe, humane, dignified voluntary return processes are enhanced along main migration routes', changed=False),
  HeadingResult(old='### 6.3.1. Achievement of outputs and results', page=64, position=2, new='### 6.3.1. Achievement of outputs and results', changed=False),
  HeadingResult(old='## Outreach and awareness', page=64, position=3, new='#### Outreach and awareness', changed=True)],
 66: [HeadingResult(old='# Assistance to stranded migrants', page=66, position=1, new='### Assistance to stranded migrants', changed=True)],
 67: [HeadingResult(old='# 6.3.2. Achievement of Specific Objective 2', page=67, position=1, new='# 6.3.2. Achievement of Specific Objective 2', changed=False),
  HeadingResult(old='## 6.4. Specific Objective 3: returnees are sustainably integrated in host communities, and host communities are better able to create living standards that address drivers of migration.', page=67, position=2, new='## 6.4. Specific Objective 3: returnees are sustainably integrated in host communities, and host communities are better able to create living standards that address drivers of migration.', changed=False)],
 68: [HeadingResult(old='# 6.4.1. Achievement of outputs and results', page=68, position=1, new='# 6.4.1. Achievement of outputs and results', changed=True),
  HeadingResult(old='## Individual and community-based reintegration', page=68, position=2, new='## Individual and community-based reintegration', changed=True)],
 70: [HeadingResult(old='# M\\&E systems', page=70, position=1, new='### M&E systems', changed=True)],
 71: [HeadingResult(old='# 6.4.2. Achievement of Specific Objective 3', page=71, position=1, new='# 6.4.2. Achievement of Specific Objective 3', changed=False),
  HeadingResult(old='## Overall achievement of reintegration', page=71, position=2, new='## Overall achievement of reintegration', changed=True)],
 77: [HeadingResult(old='# 6.5. Assessing vulnerabilities', page=77, position=1, new='### 6.5. Assessing vulnerabilities', changed=True),
  HeadingResult(old='### 6.5.1. Achievements and challenges in screening migrant vulnerabilities and assessing eligibility for support', page=77, position=2, new='#### 6.5.1. Achievements and challenges in screening migrant vulnerabilities and assessing eligibility for support', changed=False)],
 79: [HeadingResult(old='# 6.5.2. Contact and communication with beneficiaries', page=79, position=1, new='# 6.5.2. Contact and communication with beneficiaries', changed=False)],
 81: [HeadingResult(old='# 6.6. Functioning of the integrated approach', page=81, position=1, new='# 6.6. Functioning of the integrated approach', changed=False)],
 83: [HeadingResult(old='# 7. Efficiency', page=83, position=1, new='# 7. Efficiency', changed=False),
  HeadingResult(old='## Overall performance score for efficiency: 4.3/5.', page=83, position=2, new='## Overall performance score for efficiency: 4.3/5.', changed=False),
  HeadingResult(old='## Robustness score for the evidence: 3.5/5.', page=83, position=3, new='## Robustness score for the evidence: 3.5/5.', changed=False),
  HeadingResult(old='### 7.1. Did the programme receive sufficient resources to achieve its objectives?', page=83, position=4, new='### 7.1. Did the programme receive sufficient resources to achieve its objectives?', changed=False),
  HeadingResult(old="### 7.1.1. To what extent were financial resources sufficient to meet the programme's objectives?", page=83, position=5, new="#### 7.1.1. To what extent were financial resources sufficient to meet the programme's objectives?", changed=False)],
 86: [HeadingResult(old='# 7.1.2. To what extent was the "top-up" funding system efficient for planning and budgeting?', page=86, position=1, new='# 7.1.2. To what extent was the "top-up" funding system efficient for planning and budgeting?', changed=False)],
 87: [HeadingResult(old="# 7.1.3. To what extent were human resources sufficient to meet the programme's objectives?", page=87, position=1, new="# 7.1.3. To what extent were human resources sufficient to meet the programme's objectives?", changed=False)],
 88: [HeadingResult(old='# 7.1.4. To what extent were the programme activities implemented according to the initial timeline?', page=88, position=1, new='# 7.1.4. To what extent were the programme activities implemented according to the initial timeline?', changed=False)],
 89: [HeadingResult(old='# 7.2. Cost-effectiveness and efficiency of the programme', page=89, position=1, new='# 7.2. Cost-effectiveness and efficiency of the programme', changed=False),
  HeadingResult(old='### 7.2.1. How well were the resources (funds, expertise and time) converted into results?', page=89, position=2, new='### 7.2.1. How well were the resources (funds, expertise and time) converted into results?', changed=False)],
 90: [HeadingResult(old='# Increased efficiency through partnerships and capacity building', page=90, position=1, new='### Increased efficiency through partnerships and capacity building', changed=True)],
 91: [HeadingResult(old='# 7.2.2. Could the programme have been implemented in a more cost-effective manner? If so, how?', page=91, position=1, new='# 7.2.2. Could the programme have been implemented in a more cost-effective manner? If so, how?', changed=False),
  HeadingResult(old='## Improved efficiency of some activities', page=91, position=2, new='## Improved efficiency of some activities', changed=True)],
 92: [HeadingResult(old='# Decreased efficiency of some activities', page=92, position=1, new='### Decreased efficiency of some activities', changed=True)],
 93: [HeadingResult(old='# Examples of cost reduction', page=93, position=1, new='### Examples of cost reduction', changed=True)],
 94: [HeadingResult(old='# Lessons learned in relation to cost-effectiveness', page=94, position=1, new='### Lessons learned in relation to cost-effectiveness', changed=True),
  HeadingResult(old='### 7.2.3. To what extent did the programme make efficiency gains by relying on existing services?', page=94, position=2, new='### 7.2.3. To what extent did the programme make efficiency gains by relying on existing services?', changed=False)],
 97: [HeadingResult(old='# 7.2.4. To what extent did the national referral mechanisms function effectively enough to support the JI-HoA?', page=97, position=1, new='# 7.2.4. To what extent did the national referral mechanisms function effectively enough to support the JI-HoA?', changed=False)],
 100: [HeadingResult(old='# 8. Sustainability', page=100, position=1, new='# 8. Sustainability', changed=False),
  HeadingResult(old='## Overall performance score for sustainability: 2.5/5.', page=100, position=2, new='## Overall performance score for sustainability: 2.5/5.', changed=False),
  HeadingResult(old='## Robustness score for the evidence: 4.5/5.', page=100, position=3, new='## Robustness score for the evidence: 4.5/5.', changed=False),
  HeadingResult(old='### 8.1. Main achievements in terms of the technical, managerial and financial capacity of governments and other stakeholders to continue working on return and reintegration', page=100, position=4, new='### 8.1. Main achievements in terms of the technical, managerial and financial capacity of governments and other stakeholders to continue working on return and reintegration', changed=False)],
 102: [HeadingResult(old='# 8.2. Main challenges in terms of the technical, managerial, and financial capacity of governments and other stakeholders to continue working on return and reintegration', page=102, position=1, new='# 8.2. Main challenges in terms of the technical, managerial, and financial capacity of governments and other stakeholders to continue working on return and reintegration', changed=False)],
 104: [HeadingResult(old='# Part 3: Conclusions and Recommendations', page=104, position=1, new='# Part 3: Conclusions and Recommendations', changed=False)],
 105: [HeadingResult(old='# 9. Conclusions', page=105, position=1, new='# 9. Conclusions', changed=False)],
 107: [HeadingResult(old='# 10. Recommendations', page=107, position=1, new='# 10. Recommendations', changed=False),
  HeadingResult(old='## 1. Enhance efforts with national, regional and local stakeholders to build capacity and ownership (while continuing the provision of funding).', page=107, position=2, new='## 1. Enhance efforts with national, regional and local stakeholders to build capacity and ownership (while continuing the provision of funding).', changed=True)],
 108: [HeadingResult(old='# 3. Increase attention on building partnerships with service providers who can function without (significant) funding channelled by IOM.', page=108, position=1, new='# 3. Increase attention on building partnerships with service providers who can function without (significant) funding channelled by IOM.', changed=True)],
 109: [HeadingResult(old='# 5. Explore opportunities to extend the scope of support provided to returnees, with a focus on longer-term reintegration.', page=109, position=1, new='# 5. Explore opportunities to extend the scope of support provided to returnees, with a focus on longer-term reintegration.', changed=True)],
 111: [HeadingResult(old='# ANNEXES', page=111, position=1, new='# ANNEXES', changed=False)],
 112: [HeadingResult(old='# Annex 1. Evaluation framework', page=112, position=1, new='# Annex 1. Evaluation framework', changed=False)],
 122: [HeadingResult(old='# TABLE 13. EVALUATION QUESTIONS FOR THE SUSTAINABILITY CRITERION', page=122, position=1, new='# TABLE 13. EVALUATION QUESTIONS FOR THE SUSTAINABILITY CRITERION', changed=False)],
 125: [HeadingResult(old='# Annex 2. Indicators, targets and achievements', page=125, position=1, new='# Annex 2. Indicators, targets and achievements', changed=False)],
 131: [HeadingResult(old='# Annex 3. IOM performance scores and methodology', page=131, position=1, new='# Annex 3. IOM performance scores and methodology', changed=False)],
 132: [HeadingResult(old='# Relevance', page=132, position=1, new='# Relevance', changed=False)],
 133: [HeadingResult(old='# Coherence', page=133, position=1, new='# Coherence', changed=False),
  HeadingResult(old='## Effectiveness', page=133, position=2, new='## Effectiveness', changed=False)],
 136: [HeadingResult(old='# Efficiency', page=136, position=1, new='# Efficiency', changed=False),
  HeadingResult(old='## Sustainability', page=136, position=2, new='## Sustainability', changed=False)],
 137: [HeadingResult(old='# Annex 4. Assessment of indicators', page=137, position=1, new='# Annex 4. Assessment of indicators', changed=False)],
 142: [HeadingResult(old='# Annex 5. Members of the PSCs', page=142, position=1, new='# Annex 5. Members of the PSCs', changed=False)]}

source

apply_corrections_to_page

 apply_corrections_to_page (page_nb, corrections, enhanced_path)

Apply corrections to a page in the enhanced directory

Details
page_nb Page number
corrections List of corrections
enhanced_path Path to enhanced directory
Exported source
def apply_corrections_to_page(
    page_nb, # Page number
    corrections, # List of corrections
    enhanced_path, # Path to enhanced directory
    ):
    "Rewrite the headings of `page_{page_nb}.md` with their corrected text followed by the page number"
    target = enhanced_path / f"page_{page_nb}.md"
    pending = corrections.copy()  # consumed as matches are found; the caller's list is left intact
    rewritten = []

    for raw in target.read_text().splitlines():
        stripped = raw.strip()
        # First still-pending correction whose old heading equals this line (ignoring outer whitespace)
        hit = next((c for c in pending if c.old.strip() == stripped), None)
        if hit is None:
            rewritten.append(raw)
            continue
        rewritten.append(f"{hit.new} .... page {page_nb}")
        pending.remove(hit)  # each correction is applied at most once

    target.write_text('\n'.join(rewritten))
enhanced_path = doc / cfg.enhanced_dir
# Hand over only page 5's own corrections: the helper stamps `page_nb` on every
# heading it matches, so passing the whole document's list could mislabel a heading.
apply_corrections_to_page(5, group_corrections_by_page(result.results).get(5, []), enhanced_path)

source

apply_all_corrections

 apply_all_corrections (results, enhanced_path)

Apply all corrections to the pages in enhanced directory

Details
results List of headings with corrections and change status
enhanced_path Path to enhanced directory
Exported source
def apply_all_corrections(
    results, # List of headings with corrections and change status
    enhanced_path, # Path to enhanced directory
    ):
    "Group the corrections per page, then rewrite each affected page in the enhanced directory"
    by_page = group_corrections_by_page(results)
    for page_nb in by_page:
        apply_corrections_to_page(page_nb, by_page[page_nb], enhanced_path)
# Demo: apply every correction to the pages under `enhanced_path`
apply_all_corrections(result.results, enhanced_path)

source

fix_doc_hdgs

 fix_doc_hdgs (src_dir, force=False)

Process the document directory

Type Default Details
src_dir Path to the folder containing the document
force bool False Whether to overwrite the existing enhanced directory
Exported source
def fix_doc_hdgs(
    src_dir, # Path to the folder containing the document
    force=False, # Whether to overwrite the existing enhanced directory
    ):
    "Fix the headings of the document in `src_dir`, working on the pages of its enhanced directory"
    target = Path(src_dir) / cfg.enhanced_dir

    if target.exists():
        if not force:
            print(f"Enhanced directory '{cfg.enhanced_dir}' already exists. Use force=True to overwrite.")
            return
        # force=True: drop the previous run before setting the directory up again
        shutil.rmtree(target)

    enhanced_path = setup_enhanced_dir(src_dir)
    # Pages are named `page_<n>.md`; order them by their numeric suffix
    md_pages = enhanced_path.ls(file_exts=".md")
    pages = md_pages.sorted(key=lambda p: int(p.stem.split('_')[1]))
    hdgs = get_hdgs_with_pages(pages)
    result = fix_md(hdgs)
    apply_all_corrections(result.results, enhanced_path)
# Demo: run the heading fix on `doc`; force=True removes any existing enhanced directory first
print(doc)
fix_doc_hdgs(doc, force=True)
../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf

Enrich with figures description

# Point `doc` at the enhanced directory of the document to enrich
# (alternative document kept for reference on the next line).
# doc = src_dir / 'abridged_evaluation_report_final_olta_ndoja_pdf/enhanced'
doc = src_dir / 'final_evaluation_report_final_olta_ndoja_pdf/enhanced'
# Sort pages by the number after the underscore in the file stem (`page_<n>.md`)
pages = doc.ls(file_exts=".md").sorted(key=lambda p: int(p.stem.split('_')[1])); pages
(#142) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhance
d/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_17.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_20.md')...]

source

has_images

 has_images (page_path)
Exported source
def has_images(page_path):
    "Tell whether the markdown file at `page_path` contains at least one `![alt](target)` image reference"
    text = Path(page_path).read_text()
    found = re.search(r'!\[[^\]]*\]\([^)]+\)', text)
    return found is not None

For instance:

# Demo: keep only the pages that contain at least one image reference
[page for page in pages if has_images(page)]
[Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_1.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_11.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_12.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_14.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_15.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_16.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_21.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_22.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_23.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_29.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_30.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_38.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_59.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_60.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_63.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_68.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_84.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_95.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_114.md')]

source

MarkdownPage

 MarkdownPage (path)

A class to represent a markdown page

Exported source
class MarkdownPage:
    "A markdown page on disk, identified by its file path"
    def __init__(self, path):
        # Accept either a string or a Path; always store a Path
        self.path = Path(path)

source

ImgRef

A class to represent an image reference

Exported source
class ImgRef(AttrDict):
    "An image reference found in a markdown page, with its surrounding context"
    def __repr__(self):
        # Flatten the context to one line and keep its first 50 characters
        one_line = self.context.replace('\n', ' ')
        clean_context = one_line[:50] + "..."
        parts = [f"filename='{self.filename}'"]
        parts.append(f"context='{clean_context}'")
        # Classification fields are shown only once they have been set
        if hasattr(self, 'is_relevant'):
            parts.append(f"is_relevant={self.is_relevant}")
        if hasattr(self, 'reason'):
            parts.append(f"reason={self.reason}")
        joined = ', '.join(parts)
        return f"ImgRef({joined})"

source

MarkdownPage.find_img_refs

 MarkdownPage.find_img_refs (context_lines:int=3)

Find all image references in the markdown page and include the context around the image

Type Default Details
context_lines int 3 Number of lines of context to include around the image
Exported source
@patch
def find_img_refs(
    self:MarkdownPage, # Markdown page of interest
    context_lines: int = 3, # Number of lines of context to include around the image
    ):
    "Find all image references in the markdown page and include the context around the image"
    lines = self.path.read_text().splitlines()
    results = []

    for i, line in enumerate(lines):
        # One pass per line; finditer also reports a second or third image on the same line
        matches = list(re.finditer(r'!\[[^\]]*\]\(([^)]+)\)', line))
        if not matches: continue

        # Context window: up to `context_lines` lines before and after, clamped to the page
        start = max(0, i - context_lines)
        end = min(len(lines), i + context_lines + 1)
        context = '\n'.join(lines[start:end])

        # Group 1 is the link target, i.e. the image filename; images on one line share the context
        results.extend(
            ImgRef({"filename": m.group(1), "context": context})
            for m in matches
        )

    return results

For instance:

# Demo: list the image references found on each page that has any
for page in pages:
    if (img_refs := MarkdownPage(page).find_img_refs()):
        print(f"In {page.stem}: {img_refs}")
In page_1: [ImgRef(filename='img-0.jpeg', context=' Final Evaluation Report, 17 March 2023  !
In page_11: [ImgRef(filename='img-1.jpeg', context='!(img-1.jpeg)  Source: IOM Reintegrati...')]
In page_12: [ImgRef(filename='img-2.jpeg', context='# Findings .... page 12  As part of the Final Inde...'), 
ImgRef(filename='img-3.jpeg', context=' ## Relevance .... page 12  !(img-3.jp...')]
In page_14: [ImgRef(filename='img-4.jpeg', context=' # Effectiveness .... page 14  !(img-4...')]
In page_15: [ImgRef(filename='img-5.jpeg', context=' ## Efficiency .... page 15  !(img-5.j...')]
In page_16: [ImgRef(filename='img-6.jpeg', context='# Sustainability .... page 16  !(img-6...')]
In page_21: [ImgRef(filename='img-7.jpeg', context='through the development and implementation of righ...')]
In page_22: [ImgRef(filename='img-8.jpeg', context='!(img-8.jpeg)  Source: IOM Reintegrati...')]
In page_23: [ImgRef(filename='img-9.jpeg', context=' Aside from these three vertical levels, the integ...')]
In page_29: [ImgRef(filename='img-10.jpeg', context='The evaluation was carried out in accordance with ...')]
In page_30: [ImgRef(filename='img-11.jpeg', context='As part of the Final Independent Evaluation of the...')]
In page_38: [ImgRef(filename='img-12.jpeg', context='According to the community participation survey ad...')]
In page_59: [ImgRef(filename='img-13.jpeg', context='FIGURE 4: NUMBER OF STAKEHOLDERS SUPPORTED BY THE ...')]
In page_60: [ImgRef(filename='img-14.jpeg', context='FIGURE 5. INCREASED KNOWLEDGE/EXPERTISE REPORTED B...')]
In page_63: [ImgRef(filename='img-15.jpeg', context='FIGURE 6. NUMBER OF ACTORS INVOLVED IN RETURN AND ...')]
In page_68: [ImgRef(filename='img-16.jpeg', context='By September 2022, the JI-HoA had provided reinteg...')]
In page_84: [ImgRef(filename='img-17.jpeg', context='respectively), and additional activities, such as ...')]
In page_95: [ImgRef(filename='img-18.jpeg', context='The Mid Term Review of the programme in 2019 concl...')]
In page_114: [ImgRef(filename='img-19.jpeg', context='!(img-19.jpeg)  Coherence refers to t...')]

source

ImageRelevance

 ImageRelevance (img_filename:str, surrounding_context:str,
                 is_relevant:bool, reason:str)

Determine if an image contains substantive content for document understanding.

RELEVANT: Charts, graphs, diagrams, figures, tables, screenshots, flowcharts

IRRELEVANT: Logos, cover images, decorative elements, headers, footers

Exported source
# DSPy signature for the image relevance classifier: two text inputs (the image
# filename and the text around it) map to a boolean verdict plus a short
# justification.
# NOTE: DSPy uses a signature's docstring as the task instructions given to the
# LM, so the docstring and the `desc` strings below are prompt text — editing
# them changes model behavior.
class ImageRelevance(dspy.Signature):
    """Determine if an image contains substantive content for document understanding.
    
    RELEVANT: Charts, graphs, diagrams, figures, tables, screenshots, flowcharts
    IRRELEVANT: Logos, cover images, decorative elements, headers, footers
    """
    # Inputs: the classifier never sees the image itself, only its filename and context.
    img_filename: str = dspy.InputField()
    surrounding_context: str = dspy.InputField(desc="Text context around the image")
    # Outputs: the verdict and the reason given for it.
    is_relevant: bool = dspy.OutputField(desc="True only for substantive content like data visualizations")
    reason: str = dspy.OutputField(desc="Brief explanation of decision")

source

MarkdownPage.classify_imgs

 MarkdownPage.classify_imgs (img_refs:list[__main__.ImgRef])

Classify images in the markdown page

Type Details
img_refs list List of image references
Exported source
@patch
def classify_imgs(
    self:MarkdownPage, # Markdown page of interest
    img_refs: list[ImgRef], # List of image references
    ):
    """Classify images in the markdown page

    Each `ImgRef` is updated in place with `is_relevant` and `reason` taken from
    the classifier's prediction; the same list is returned.
    """
    classifier = dspy.ChainOfThought(ImageRelevance)
    for img_ref in img_refs:
        # Pass only the inputs that `ImageRelevance` declares.
        result = classifier(
            img_filename=img_ref.filename,
            surrounding_context=img_ref.context,
        )
        img_ref.is_relevant = result.is_relevant
        img_ref.reason = result.reason
    return img_refs

For instance:

# Image references found on the first page (no classification yet).
img_refs = MarkdownPage(pages[0]).find_img_refs(); print(img_refs)
# Find, then classify, the image references of the page at index 5.
md_page = MarkdownPage(pages[5]) 
img_refs = md_page.find_img_refs()
clf_img_refs = md_page.classify_imgs(img_refs)
# `classify_imgs` returns the list it was given, annotated in place.
print(clf_img_refs)
[ImgRef(filename='img-0.jpeg', context=' Final Evaluation Report, 17 March 2023  ![img-0.j...')]
[]

source

describe_img

 describe_img (img_path:pathlib.Path, context:str, api_key:str=None,
               model:str='gemini/gemini-2.0-flash-exp')

Describe an image using an LLM

Type Default Details
img_path Path Path to the image
context str Context of the image
api_key str None API key for the Gemini model
model str gemini/gemini-2.0-flash-exp Model to use
Exported source
def describe_img(
    img_path: Path, # Path to the image
    context: str, # Context of the image
    api_key: str = GEMINI_API_KEY, # API key for the Gemini model
    model: str = cfg.lm, # Model to use
    ):
    """Describe an image using an LLM

    The image is sent inline as a base64 data URL together with a text prompt
    that embeds `context`; the content of the first returned choice is returned.
    """
    # Inline the raw image bytes as base64 text
    encoded = base64.b64encode(img_path.read_bytes()).decode('utf-8')

    # Image subtype comes from the file extension; 'jpg' is normalised to 'jpeg'
    ext = img_path.suffix.lower().lstrip('.')
    img_format = 'jpeg' if ext == 'jpg' else ext

    prompt = f"""Provide a concise paragraph description of this image for evaluation report analysis. Include: type of content, main topic, key data/statistics, trends, and takeaways. Write as flowing text, not numbered points. Context: {context}"""

    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/{img_format};base64,{encoded}"}}
    user_message = {"role": "user", "content": [text_part, image_part]}

    response = completion(model=model, messages=[user_message], api_key=api_key)
    return response.choices[0].message.content

source

MarkdownPage.describe_imgs

 MarkdownPage.describe_imgs (img_refs:list[__main__.ImgRef], img_dir:str)

Describe images in the markdown page

Type Details
img_refs list List of image references
img_dir str Image directory
Exported source
@patch
def describe_imgs(
    self:MarkdownPage, # Markdown page of interest
    img_refs: list[ImgRef], # List of image references
    img_dir: str # Image directory
    ):
    """Describe images in the markdown page

    Only references flagged `is_relevant` are described; each gets a
    `description` attribute set in place. The same list is returned.
    """
    for ref in img_refs:
        # Irrelevant images are left untouched (no `description` is set)
        if not ref.is_relevant: continue
        ref.description = describe_img(Path(img_dir) / ref.filename, ref.context, GEMINI_API_KEY)
    return img_refs

For instance:

# md_page = MarkdownPage(pages[5]) 
# img_refs = md_page.find_img_refs()
# clf_img_refs = md_page.classify_imgs(img_refs)
# img_refs_desc = md_page.describe_imgs(clf_img_refs, doc.parent / 'img')
# print(img_refs_desc[0].description)

source

MarkdownPage.replace_imgs_with_desc

 MarkdownPage.replace_imgs_with_desc (img_refs,
                                      enriched_dir:str='enriched')

Replace images with their descriptions in the markdown page

Type Default Details
img_refs List of image references
enriched_dir str enriched Enriched directory
Exported source
@patch
def replace_imgs_with_desc(
    self:MarkdownPage, # Markdown page of interest
    img_refs, # List of image references
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
    ):
    """Replace images with their descriptions in the markdown page

    Every `![...](<filename>)` occurrence of a relevant, described image is
    replaced by its description. The result is written under
    `<page dir>/../<enriched_dir>/` with the page's own file name, and the path
    of the written file is returned. The source page is not modified.
    """
    enriched_path = self.path.parent.parent / enriched_dir
    enriched_path.mkdir(exist_ok=True)

    content = self.path.read_text()
    for img_ref in img_refs:
        description = getattr(img_ref, 'description', None)
        # Skip images that were filtered out or never received a description
        if not img_ref.is_relevant or description is None: continue
        pattern = f'!\\[[^\\]]*\\]\\({re.escape(img_ref.filename)}\\)'
        # Use a callable replacement so the description is inserted verbatim:
        # a plain string would be parsed as a template, and backslashes in
        # LLM-written text (e.g. `\d`, `\1`) would raise `re.error` or be rewritten.
        content = re.sub(pattern, lambda _match, text=description: text, content)

    enriched_file = enriched_path / self.path.name
    enriched_file.write_text(content)
    return enriched_file

source

copy_page_to_enriched

 copy_page_to_enriched (page, enriched_dir:str='enriched')

Copy a page to the enriched directory

Type Default Details
page Page to copy
enriched_dir str enriched Enriched directory
Exported source
def copy_page_to_enriched(
    page, # Page to copy
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
    ):
    """Copy a page to the enriched directory

    The destination is `<page dir>/../<enriched_dir>/`, created if missing.
    Returns whatever `shutil.copy` returns.
    """
    # The enriched directory is a sibling of the directory holding the page
    dest_dir = page.parent.parent / enriched_dir
    dest_dir.mkdir(exist_ok=True)
    copied = shutil.copy(page, dest_dir)
    return copied

source

process_single_page

 process_single_page (page, img_dir, enriched_dir:str='enriched')

Process a single page

Type Default Details
page Page to process
img_dir Image directory
enriched_dir str enriched Enriched directory
Exported source
def process_single_page(
    page, # Page to process
    img_dir, # Image directory
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
    ):
    """Process a single page

    Pipeline: find → classify → describe → replace. A page with no image
    references is copied unchanged to the enriched directory. In both cases
    the result lands under `enriched_dir`, and the value returned by the
    copy/replace step is returned.
    """
    md_page = MarkdownPage(page)
    img_refs = md_page.find_img_refs()

    # Nothing to enrich: keep the page as-is in the output directory
    if not img_refs: return copy_page_to_enriched(page, enriched_dir)

    classified_refs = md_page.classify_imgs(img_refs)
    time.sleep(0.5)  # short pause between LLM-backed steps — presumably for API rate limits (TODO confirm)
    described_refs = md_page.describe_imgs(classified_refs, img_dir)
    time.sleep(0.5)
    # Forward `enriched_dir` so both branches write to the same directory
    return md_page.replace_imgs_with_desc(described_refs, enriched_dir)

source

enrich_images

 enrich_images (pages_dir, img_dir, n_workers=2)

Enrich images in the pages directory

Type Default Details
pages_dir Pages directory
img_dir Image directory
n_workers int 2 Number of workers
Exported source
def enrich_images(
    pages_dir, # Pages directory
    img_dir, # Image directory
    n_workers=2, # Number of workers
    ):
    """Enrich images in the pages directory

    Pages without images are copied straight to the enriched directory; pages
    with images go through `process_single_page` via `parallel`. A one-line
    summary is printed at the end.
    """
    md_files = Path(pages_dir).ls(file_exts=".md")

    to_enrich = []
    for md_file in md_files:
        if not has_images(md_file):
            # No images: plain copy, no LLM calls needed
            copy_page_to_enriched(md_file)
            continue
        to_enrich.append(md_file)

    if to_enrich:
        worker = partial(process_single_page, img_dir=img_dir)
        parallel(worker, to_enrich, n_workers=n_workers, threadpool=True, progress=True)

    print(f"✓ Processed {len(md_files)} pages ({len(to_enrich)} with images)")
#enrich_images(doc, doc.parent / 'img', n_workers=1)