Fix and clean markdown headings, and enrich documents with figure descriptions.

This module aims to fix and enrich markdown headings from OCR’d PDF files by:

  1. Fixing heading hierarchy that was corrupted during OCR
  2. Adding page numbers to headings for better navigation
  3. Enriching figure references with descriptive text and creating a table of figures
Exported source
# Pipeline configuration; AttrDict (fastcore) allows attribute-style access (cfg.lm).
cfg = AttrDict({
    'enhanced_dir': 'enhanced',       # subdirectory for heading-fixed pages
    'enriched_dir': 'enriched',       # subdirectory for image-enriched pages
    'lm': 'gemini/gemini-2.0-flash',  # LiteLLM model id used by dspy and describe_img
    'api_key': GEMINI_API_KEY,        # API key constant defined earlier in the file
    'max_tokens': 8192,               # output token budget for the heading-fix LLM call
    'track_usage': False,             # dspy usage tracking toggle
    'img_dir': 'img'                  # subdirectory holding extracted page images
})

Fixing Markdown Headings

# doc = src_dir / 'abridged_evaluation_report_final_olta_ndoja_pdf'
doc = src_dir / 'final_evaluation_report_final_olta_ndoja_pdf'
pages = doc.ls(file_exts=".md").sorted(key=lambda p: int(p.stem.split('_')[1])); pages
(#142) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_17.md'),Path('../_da
ta/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_20.md')...]

source

setup_enhanced_dir

 setup_enhanced_dir (src_dir, enhanced_dir_name='enhanced')

Create enhanced directory and copy all markdown files to it

Type Default Details
src_dir Source directory path
enhanced_dir_name str enhanced Name of enhanced subdirectory
Exported source
def setup_enhanced_dir(
    src_dir, # Source directory path
    enhanced_dir_name=cfg.enhanced_dir # Name of enhanced subdirectory
    ):
    "Create enhanced directory and copy all markdown files to it"
    source = Path(src_dir)
    dest = source / enhanced_dir_name
    dest.mkdir(exist_ok=True)
    # Copy every .md page into the enhanced subdirectory so the originals stay untouched.
    for md_file in source.ls(file_exts=".md"):
        shutil.copy(md_file, dest)
    return dest

For instance:

setup_enhanced_dir(doc, 'enhanced')
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced')

source

get_hdgs

 get_hdgs (md_txt)
Exported source
def get_hdgs(md_txt): return re.findall(r'^#+.*$', md_txt, re.MULTILINE)

source

get_hdgs_with_pages

 get_hdgs_with_pages (pages:list[pathlib.Path])

Get headings and the page number they are on

Type Details
pages list List of pages
Exported source
def get_hdgs_with_pages(
    pages: list[Path] # List of pages
    ):
    "Get headings and the page number they are on"
    found = []
    # Page numbers are 1-based: the first file in the list is page 1.
    for page_num, page in enumerate(pages, start=1):
        text = page.read_text()
        for hdg in re.findall(r'^#+.*$', text, re.MULTILINE):
            found.append({'heading': hdg, 'page': page_num})
    return found
hdgs = get_hdgs_with_pages(pages); hdgs[:5]
[{'heading': '# **PPMi**', 'page': 1},
 {'heading': '# LIST OF FIGURES ', 'page': 5},
 {'heading': '# Abbreviations and terminology ', 'page': 6},
 {'heading': '# Key terminology ', 'page': 8},
 {'heading': '# Executive summary ', 'page': 10}]
toc = L([get_hdgs(p.read_text()) for p in pages]).concat(); toc[:5]
(#5) ['# **PPMi**','# LIST OF FIGURES ','# Abbreviations and terminology ','# Key terminology ','# Executive summary ']

source

format_hdgs

 format_hdgs (hdgs:list[dict])

Format headings with page numbers

Type Details
hdgs list List of headings with page numbers
Exported source
def format_hdgs(
    hdgs: list[dict] # List of headings with page numbers
    ):
    "Format headings with page numbers"
    seen = {}  # page number -> how many headings emitted for that page so far

    def _fmt(entry):
        # Position restarts at 1 for each page and counts headings in order.
        pg = entry['page']
        pos = seen[pg] = seen.get(pg, 0) + 1
        return f"{entry['heading']} (Page {pg}, Position {pos})"

    return "\n".join(_fmt(entry) for entry in hdgs)
print(format_hdgs(hdgs)[:500])
# **PPMi** (Page 1, Position 1)
# LIST OF FIGURES  (Page 5, Position 1)
# Abbreviations and terminology  (Page 6, Position 1)
# Key terminology  (Page 8, Position 1)
# Executive summary  (Page 10, Position 1)
## Background (Page 10, Position 2)
# Methodology  (Page 11, Position 1)
# Findings  (Page 12, Position 1)
## Relevance (Page 12, Position 2)
# Coherence  (Page 13, Position 1)
## $4.3 / 5$ (Page 13, Position 2)
# Effectiveness  (Page 14, Position 1)
## Specific Outcome 1: (Page 14, Positio

source

HeadingResult

 HeadingResult (old:str, page:int, position:int, new:str, changed:bool)

*!!! abstract “Usage Documentation” Models

A base class for creating Pydantic models.

Attributes: class_vars: The names of the class variables defined on the model. private_attributes: Metadata about the private attributes of the model. signature: The synthesized __init__ [Signature][inspect.Signature] of the model.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.*
Exported source
# Configure the module-wide default dspy language model (Gemini via LiteLLM).
lm = dspy.LM(cfg.lm, api_key=cfg.api_key)
dspy.configure(lm=lm)
# Usage tracking follows cfg (off by default); fix_md may override per call.
dspy.settings.configure(track_usage=cfg.track_usage)
Exported source
class HeadingResult(BaseModel):
    # One heading correction as returned by the LLM (see FixHeadingHierarchy).
    # NOTE: no class docstring on purpose — pydantic would surface it in the
    # model schema, which dspy feeds into the prompt.
    old: str       # original heading line, verbatim
    page: int      # 1-based page number the heading appears on
    position: int  # 1-based position of the heading within its page
    new: str       # corrected heading line
    changed: bool  # True if correction was made

source

FixHeadingHierarchy

 FixHeadingHierarchy (headings_with_pages:str,
                      results:List[__main__.HeadingResult])

Fix markdown heading hierarchy by analyzing the document’s numbering patterns: - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.) - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level - When a section number is lower than a previously seen number at the same level (e.g., seeing ‘2.’ after ‘3.1’), it’s likely a subsection or list item, not a main section - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections - Return ALL headings with their corrected form

Exported source
class FixHeadingHierarchy(dspy.Signature):
    # NOTE(review): this docstring doubles as the LLM instruction prompt — do not
    # edit it casually; wording changes alter model behavior.
    """Fix markdown heading hierarchy by analyzing the document's numbering patterns:
    - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.)
    - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level
    - When a section number is lower than a previously seen number at the same level (e.g., seeing '2.' after '3.1'), it's likely a subsection or list item, not a main section
    - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections
    - Return ALL headings with their corrected form
    """
    
    # Input is the string produced by format_hdgs (one "<heading> (Page p, Position q)" per line).
    headings_with_pages: str = dspy.InputField(desc="List of headings with page numbers")
    results: List[HeadingResult] = dspy.OutputField(desc="All headings with corrections and change status")

source

fix_md

 fix_md (hdgs:list[dict], track_usage:bool=False)

Fix markdown headings

Type Default Details
hdgs list List of headings with page numbers
track_usage bool False
Exported source
def fix_md(
    hdgs: list[dict], # List of headings with page numbers
    track_usage: bool=cfg.track_usage,
    ):
    "Fix markdown headings"
    # Reconfigure dspy with the larger max_tokens budget needed for long heading lists.
    dspy.configure(lm=dspy.LM(cfg.lm, api_key=cfg.api_key, max_tokens=cfg.max_tokens))
    dspy.settings.configure(track_usage=track_usage)

    predictor = dspy.ChainOfThought(FixHeadingHierarchy)
    return predictor(headings_with_pages=format_hdgs(hdgs))
result = fix_md(hdgs, track_usage=True)

source

group_corrections_by_page

 group_corrections_by_page (results:list[__main__.HeadingResult])

Group HeadingResult corrections by page number into dict with page nums as keys

Type Details
results list List of headings with corrections and change status
Exported source
def group_corrections_by_page(
    results: list[HeadingResult], # List of headings with corrections and change status
    ):
    "Group HeadingResult corrections by page number into dict with page nums as keys"
    page_groups = {}
    for result in results:
        # setdefault replaces the explicit membership-check-then-insert idiom;
        # within each page, insertion order of corrections is preserved.
        page_groups.setdefault(result.page, []).append(result)
    return page_groups
group_corrections_by_page(result.results)

source

apply_corrections_to_page

 apply_corrections_to_page (page_nb, corrections, enhanced_path)

Apply corrections to a page in the enhanced directory

Details
page_nb Page number
corrections List of corrections
enhanced_path Path to enhanced directory
Exported source
def apply_corrections_to_page(
    page_nb, # Page number
    corrections, # List of corrections
    enhanced_path, # Path to enhanced directory
    ):
    "Apply corrections to a page in the enhanced directory"
    page_file = enhanced_path / f"page_{page_nb}.md"
    lines = page_file.read_text().splitlines()

    # Index pending corrections by their stripped original text. Each correction
    # is consumed at most once, in list order for duplicate headings — same
    # semantics as the previous copy/remove scan, but O(lines + corrections)
    # instead of rescanning the remaining corrections for every line.
    pending = {}
    for correction in corrections:
        pending.setdefault(correction.old.strip(), []).append(correction)

    for i, line in enumerate(lines):
        queue = pending.get(line.strip())
        if queue:
            correction = queue.pop(0)
            # Annotate the corrected heading with its page for navigation.
            lines[i] = f"{correction.new} .... page {page_nb}"

    # splitlines() drops the trailing newline, so the rewritten file has none —
    # matching the original behavior.
    page_file.write_text('\n'.join(lines))
enhanced_path = doc / cfg.enhanced_dir
apply_corrections_to_page(5, result.results, enhanced_path)

source

apply_all_corrections

 apply_all_corrections (results, enhanced_path)

Apply all corrections to the pages in enhanced directory

Details
results List of headings with corrections and change status
enhanced_path Path to enhanced directory
Exported source
def apply_all_corrections(
    results, # List of headings with corrections and change status
    enhanced_path, # Path to enhanced directory
    ):
    "Apply all corrections to the pages in enhanced directory"
    # Bucket corrections per page, then rewrite each page file once.
    for page_nb, page_corrections in group_corrections_by_page(results).items():
        apply_corrections_to_page(page_nb, page_corrections, enhanced_path)
apply_all_corrections(result.results, enhanced_path)

source

fix_doc_hdgs

 fix_doc_hdgs (src_dir, force=False)

Process the document directory

Type Default Details
src_dir Path to the folder containing the document
force bool False Whether to overwrite the existing enhanced directory
Exported source
def fix_doc_hdgs(
    src_dir, # Path to the folder containing the document
    force=False, # Whether to overwrite the existing enhanced directory
    ):
    "Process the document directory"
    enhanced_path = Path(src_dir) / cfg.enhanced_dir

    # Refuse to clobber an existing run unless the caller opts in with force=True.
    if enhanced_path.exists():
        if not force:
            print(f"Enhanced directory '{cfg.enhanced_dir}' already exists. Use force=True to overwrite.")
            return
        shutil.rmtree(enhanced_path)

    enhanced_path = setup_enhanced_dir(src_dir)
    # Sort pages numerically by the N in "page_N.md" (lexicographic order would misplace page_10).
    pages = enhanced_path.ls(file_exts=".md").sorted(key=lambda p: int(p.stem.split('_')[1]))
    corrections = fix_md(get_hdgs_with_pages(pages))
    apply_all_corrections(corrections.results, enhanced_path)
print(doc)
fix_doc_hdgs(doc, force=True)
../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf

Enrich with figures description

# doc = src_dir / 'abridged_evaluation_report_final_olta_ndoja_pdf/enhanced'
doc = src_dir / 'final_evaluation_report_final_olta_ndoja_pdf/enhanced'
pages = doc.ls(file_exts=".md").sorted(key=lambda p: int(p.stem.split('_')[1])); pages
(#142) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhance
d/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_17.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_20.md')...]

source

has_images

 has_images (page_path)
Exported source
def has_images(page_path):
    "True if the markdown file contains at least one image reference ![alt](src)."
    text = Path(page_path).read_text()
    return re.search(r'!\[[^\]]*\]\([^)]+\)', text) is not None

For instance:

[page for page in pages if has_images(page)]
[Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_1.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_11.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_12.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_14.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_15.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_16.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_21.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_22.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_23.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_29.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_30.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_38.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_59.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_60.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_63.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_68.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_84.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_95.md'),
 Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_114.md')]

source

MarkdownPage

 MarkdownPage (path)

A class to represent a markdown page

Exported source
class MarkdownPage: 
    "A class to represent a markdown page"
    # Thin wrapper around a markdown file path; all behavior (find_img_refs,
    # classify_imgs, describe_imgs, replace_imgs_with_desc) is attached later via @patch.
    def __init__(self, path): self.path = Path(path)  # path: str or Path to the .md file

source

ImgRef

A class to represent an image reference

Exported source
class ImgRef(AttrDict):
    "A dict-backed record for one image reference found in a markdown page"
    def __repr__(self):
        # Show a single-line, truncated context preview plus any optional
        # fields (is_relevant, reason) that later pipeline stages attach.
        preview = self.context.replace('\n', ' ')[:50] + "..."
        parts = [f"filename='{self.filename}'", f"context='{preview}'"]
        if hasattr(self, 'is_relevant'): parts.append(f"is_relevant={self.is_relevant}")
        if hasattr(self, 'reason'): parts.append(f"reason={self.reason}")
        return f"ImgRef({', '.join(parts)})"

source

MarkdownPage.find_img_refs

 MarkdownPage.find_img_refs (context_lines:int=3)

Find all image references in the markdown page and include the context around the image

Type Default Details
context_lines int 3 Number of lines of context to include around the image
Exported source
@patch
def find_img_refs(
    self:MarkdownPage, # Markdown page of interest
    context_lines: int = 3, # Number of lines of context to include around the image
    ):
    "Find all image references in the markdown page and include the context around the image"
    # Compile once and search each line a single time; the previous version
    # ran the identical regex twice per matching line (re.search for the test,
    # then again to extract the filename).
    img_pat = re.compile(r'!\[[^\]]*\]\(([^)]+)\)')
    lines = self.path.read_text().splitlines()
    results = []

    for i, line in enumerate(lines):
        match = img_pat.search(line)
        if match is None: continue
        # Context window of surrounding lines, clamped to the page bounds.
        start = max(0, i - context_lines)
        end = min(len(lines), i + context_lines + 1)
        results.append(ImgRef({
            "filename": match.group(1),
            "context": '\n'.join(lines[start:end]),
        }))

    return results

For instance:

for page in pages: 
    img_refs = MarkdownPage(page).find_img_refs()
    if img_refs: print(f"In {page.stem}: {img_refs}")

source

ImageRelevance

 ImageRelevance (img_filename:str, surrounding_context:str,
                 is_relevant:bool, reason:str)

*Determine if an image contains substantive content for document understanding.

RELEVANT: Charts, graphs, diagrams, figures, tables, screenshots, flowcharts IRRELEVANT: Logos, cover images, decorative elements, headers, footers*

Exported source
class ImageRelevance(dspy.Signature):
    # NOTE(review): this docstring is the LLM classification prompt — wording
    # changes alter which images get described.
    """Determine if an image contains substantive content for document understanding.
    
    RELEVANT: Charts, graphs, diagrams, figures, tables, screenshots, flowcharts
    IRRELEVANT: Logos, cover images, decorative elements, headers, footers
    """
    img_filename: str = dspy.InputField()
    surrounding_context: str = dspy.InputField(desc="Text context around the image")
    is_relevant: bool = dspy.OutputField(desc="True only for substantive content like data visualizations")
    reason: str = dspy.OutputField(desc="Brief explanation of decision")

source

MarkdownPage.classify_imgs

 MarkdownPage.classify_imgs (img_refs:list[__main__.ImgRef])

Classify images in the markdown page

Type Details
img_refs list List of image references
Exported source
@patch
def classify_imgs(
    self:MarkdownPage, # Markdown page of interest
    img_refs: list[ImgRef], # List of image references
    ):
    "Classify images in the markdown page, annotating each ref in place"
    classifier = dspy.ChainOfThought(ImageRelevance)
    for img_ref in img_refs:
        # Pass only the inputs declared on ImageRelevance. The previous
        # `page_nb=1` kwarg was not an InputField of the signature and is dropped.
        result = classifier(
            img_filename=img_ref.filename,
            surrounding_context=img_ref.context,
        )
        img_ref.is_relevant = result.is_relevant
        img_ref.reason = result.reason
    return img_refs

For instance:

img_refs = MarkdownPage(pages[0]).find_img_refs(); print(img_refs)
md_page = MarkdownPage(pages[5]) 
img_refs = md_page.find_img_refs()
clf_img_refs = md_page.classify_imgs(img_refs)
print(clf_img_refs)
[ImgRef(filename='img-0.jpeg', context=' Final Evaluation Report, 17 March 2023  ![img-0.j...')]
[]

source

describe_img

 describe_img (img_path:pathlib.Path, context:str, api_key:str=None,
               model:str='gemini/gemini-2.0-flash')

Describe an image using an LLM

Type Default Details
img_path Path Path to the image
context str Context of the image
api_key str None API key for the LLM model
model str gemini/gemini-2.0-flash Model to use
Exported source
def describe_img(
    img_path: Path, # Path to the image
    context: str, # Context of the image
    api_key: str = cfg.api_key, # API key for the LLM model
    model: str = cfg.lm, # Model to use
    ):
    "Describe an image using an LLM"
    # pathlib one-liner replaces the manual open/read/close block.
    base64_image = base64.b64encode(Path(img_path).read_bytes()).decode('utf-8')

    # Derive the data-URL media subtype from the file suffix ('.jpg' → 'jpeg').
    img_format = img_path.suffix.lower().replace('.', '')
    if img_format == 'jpg': img_format = 'jpeg'
    
    prompt = f"""Provide a concise paragraph description of this image for evaluation report analysis. Include: type of content, main topic, key data/statistics, trends, and takeaways. Write as flowing text, not numbered points. Context: {context}"""
    response = completion(
        model=model,
        messages=[{
            "role": "user", 
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/{img_format};base64,{base64_image}"}}
            ]
        }],
        api_key=api_key
    )
    return response.choices[0].message.content

source

MarkdownPage.describe_imgs

 MarkdownPage.describe_imgs (img_refs:list[__main__.ImgRef], img_dir:str)

Describe images in the markdown page

Type Details
img_refs list List of image references
img_dir str Image directory
Exported source
@patch
def describe_imgs(
    self:MarkdownPage, # Markdown page of interest
    img_refs: list[ImgRef], # List of image references
    img_dir: str # Image directory
    ):
    "Describe relevant images in the markdown page, attaching .description to each"
    for img_ref in img_refs:
        if img_ref.is_relevant:
            img_path = Path(img_dir) / img_ref.filename
            # Route the key through cfg for consistency with the rest of the
            # module (previously passed the module-level GEMINI_API_KEY directly).
            img_ref.description = describe_img(img_path, img_ref.context, cfg.api_key)
    return img_refs

For instance:

# md_page = MarkdownPage(pages[5]) 
# img_refs = md_page.find_img_refs()
# clf_img_refs = md_page.classify_imgs(img_refs)
# img_refs_desc = md_page.describe_imgs(clf_img_refs, doc.parent / 'img')
# print(img_refs_desc[0].description)

source

MarkdownPage.replace_imgs_with_desc

 MarkdownPage.replace_imgs_with_desc (img_refs,
                                      enriched_dir:str='enriched')

Replace images with their descriptions in the markdown page

Type Default Details
img_refs List of image references
enriched_dir str enriched Enriched directory
Exported source
@patch
def replace_imgs_with_desc(
    self:MarkdownPage, # Markdown page of interest
    img_refs, # List of image references
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
    ):
    "Replace images with their descriptions in the markdown page"
    # Enriched output lives beside the 'enhanced' directory, under the document root.
    enriched_path = self.path.parent.parent / enriched_dir
    enriched_path.mkdir(exist_ok=True)
    
    content = self.path.read_text()
    for img_ref in img_refs:
        if img_ref.is_relevant and hasattr(img_ref, 'description'):
            pattern = f'!\\[[^\\]]*\\]\\({re.escape(img_ref.filename)}\\)'
            # Use a callable replacement: with a plain string, re.sub would
            # interpret backslashes and group refs (e.g. '\1') inside the
            # LLM-generated description as regex escapes, corrupting the output
            # or raising 'bad escape'.
            content = re.sub(pattern, lambda m, desc=img_ref.description: desc, content)
    
    enriched_file = enriched_path / self.path.name
    enriched_file.write_text(content)
    return enriched_file

source

copy_page_to_enriched

 copy_page_to_enriched (page, enriched_dir:str='enriched')

Copy a page to the enriched directory

Type Default Details
page Page to copy
enriched_dir str enriched Enriched directory
Exported source
def copy_page_to_enriched(
    page, # Page to copy
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
    ):
    "Copy a page to the enriched directory"
    # The enriched directory sits beside 'enhanced', under the document root.
    dest = page.parent.parent / enriched_dir
    dest.mkdir(exist_ok=True)
    return shutil.copy(page, dest)

source

process_single_page

 process_single_page (page, img_dir, enriched_dir:str='enriched')

Process a single page

Type Default Details
page Page to process
img_dir Image directory
enriched_dir str enriched Enriched directory
Exported source
def process_single_page(
    page, # Page to process
    img_dir, # Image directory
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
    ):
    "Process a single page through the find → classify → describe → replace pipeline"
    md_page = MarkdownPage(page)
    refs = md_page.find_img_refs()

    # Pages without images are copied through untouched.
    if not refs:
        return copy_page_to_enriched(page, enriched_dir)

    refs = md_page.classify_imgs(refs)
    time.sleep(0.5)  # brief pause between LLM calls to ease rate limits
    refs = md_page.describe_imgs(refs, img_dir)
    time.sleep(0.5)
    return md_page.replace_imgs_with_desc(refs)

source

enrich_images

 enrich_images (pages_dir, img_dir, n_workers=2)

Enrich images in the pages directory

Type Default Details
pages_dir Pages directory
img_dir Image directory
n_workers int 2 Number of workers
Exported source
def enrich_images(
    pages_dir, # Pages directory
    img_dir, # Image directory
    n_workers=2, # Number of workers
    ):
    "Enrich images in the pages directory"
    pages = Path(pages_dir).ls(file_exts=".md")

    # Partition pages: image-free pages are copied straight through,
    # the rest go through the LLM pipeline in parallel.
    pages_with_imgs, plain_pages = [], []
    for page in pages:
        (pages_with_imgs if has_images(page) else plain_pages).append(page)

    for page in plain_pages:
        copy_page_to_enriched(page)

    if pages_with_imgs:
        process_fn = partial(process_single_page, img_dir=img_dir)
        parallel(process_fn, pages_with_imgs, n_workers=n_workers, threadpool=True, progress=True)

    print(f"✓ Processed {len(pages)} pages ({len(pages_with_imgs)} with images)")
enrich_images(doc, doc.parent / 'img', n_workers=1)

CLI


source

md_plus_evaluation

 md_plus_evaluation (eval_id:str, md_dir:str='../data/md_library',
                     overwrite:bool=False)

Fix markdown headings and enrich images for an evaluation report

Type Default Details
eval_id str Evaluation ID to process
md_dir str ../data/md_library Directory containing markdown folders
overwrite bool False Overwrite if enhanced/enriched already exists