Exported source
# Module-wide configuration for the heading-fix / image-enrichment pipeline.
cfg = AttrDict({
'enhanced_dir': 'enhanced',  # subdirectory for heading-corrected markdown pages
'enriched_dir': 'enriched',  # subdirectory for image-enriched pages
'lm': 'gemini/gemini-2.0-flash',  # model id handed to dspy.LM / litellm
'api_key': GEMINI_API_KEY,  # assumes GEMINI_API_KEY is defined earlier in the notebook — TODO confirm
'max_tokens': 8192,  # LM output-token cap
'track_usage': False,  # dspy usage tracking off by default
'img_dir': 'img'  # default image directory name
})This module aims to fix and enrich markdown headings from OCR’d PDF files by:
(#142) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_17.md'),Path('../_da
ta/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/page_20.md')...]
setup_enhanced_dir (src_dir, enhanced_dir_name='enhanced')
Create enhanced directory and copy all markdown files to it
| Type | Default | Details | |
|---|---|---|---|
| src_dir | Source directory path | ||
| enhanced_dir_name | str | enhanced | Name of enhanced subdirectory |
def setup_enhanced_dir(
    src_dir, # Source directory path
    enhanced_dir_name=cfg.enhanced_dir # Name of enhanced subdirectory
):
    "Create enhanced directory and copy all markdown files to it"
    root = Path(src_dir)
    dest = root / enhanced_dir_name
    dest.mkdir(exist_ok=True)  # idempotent: reuse the directory if it already exists
    # NOTE: .ls(file_exts=...) is fastcore's Path extension, not stdlib pathlib.
    for md_file in root.ls(file_exts=".md"):
        shutil.copy(md_file, dest)
    return dest
# For instance:
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced')
get_hdgs (md_txt)
get_hdgs_with_pages (pages:list[pathlib.Path])
Get headings and the page number they are on
| Type | Details | |
|---|---|---|
| pages | list | List of pages |
def get_hdgs_with_pages(
    pages: list[Path] # List of pages
):
    "Get headings and the page number they are on"
    # Flatten (page -> headings) into one list of dicts; page numbers are 1-based.
    return [{'heading': hdg, 'page': nb}
            for nb, pg in enumerate(pages, 1)
            for hdg in get_hdgs(pg.read_text())]
# [{'heading': '# **PPMi**', 'page': 1},
{'heading': '# LIST OF FIGURES ', 'page': 5},
{'heading': '# Abbreviations and terminology ', 'page': 6},
{'heading': '# Key terminology ', 'page': 8},
{'heading': '# Executive summary ', 'page': 10}]
(#5) ['# **PPMi**','# LIST OF FIGURES ','# Abbreviations and terminology ','# Key terminology ','# Executive summary ']
format_hdgs (hdgs:list[dict])
Format headings with page numbers
| Type | Details | |
|---|---|---|
| hdgs | list | List of headings with page numbers |
def format_hdgs(
    hdgs: list[dict] # List of headings with page numbers
):
    "Format headings with page numbers"
    # Track a running per-page position counter so each heading is labelled
    # with both its page and its order of appearance on that page.
    out, seen = [], {}
    for entry in hdgs:
        pg = entry['page']
        pos = seen.get(pg, 0) + 1
        seen[pg] = pos
        out.append(f"{entry['heading']} (Page {pg}, Position {pos})")
    return "\n".join(out)
# **PPMi** (Page 1, Position 1) # LIST OF FIGURES (Page 5, Position 1) # Abbreviations and terminology (Page 6, Position 1) # Key terminology (Page 8, Position 1) # Executive summary (Page 10, Position 1) ## Background (Page 10, Position 2) # Methodology (Page 11, Position 1) # Findings (Page 12, Position 1) ## Relevance (Page 12, Position 2) # Coherence (Page 13, Position 1) ## $4.3 / 5$ (Page 13, Position 2) # Effectiveness (Page 14, Position 1) ## Specific Outcome 1: (Page 14, Positio
HeadingResult (old:str, page:int, position:int, new:str, changed:bool)
*!!! abstract “Usage Documentation” Models
A base class for creating Pydantic models.
Attributes: class_vars: The names of the class variables defined on the model. private_attributes: Metadata about the private attributes of the model. signature: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
__args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.*
FixHeadingHierarchy (headings_with_pages:str, results:List[__main__.HeadingResult])
Fix markdown heading hierarchy by analyzing the document’s numbering patterns: - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.) - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level - When a section number is lower than a previously seen number at the same level (e.g., seeing ‘2.’ after ‘3.1’), it’s likely a subsection or list item, not a main section - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections - Return ALL headings with their corrected form
# dspy Signature describing the heading-repair task.
# NOTE: the docstring below is sent to the LM as the task prompt — edits to it
# change model behavior, so it is left exactly as written.
class FixHeadingHierarchy(dspy.Signature):
    """Fix markdown heading hierarchy by analyzing the document's numbering patterns:
    - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.)
    - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level
    - When a section number is lower than a previously seen number at the same level (e.g., seeing '2.' after '3.1'), it's likely a subsection or list item, not a main section
    - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections
    - Return ALL headings with their corrected form
    """
    # Input: the string produced by format_hdgs (one "heading (Page p, Position n)" per line).
    headings_with_pages: str = dspy.InputField(desc="List of headings with page numbers")
    # Output: one HeadingResult per input heading, changed or not.
    results: List[HeadingResult] = dspy.OutputField(desc="All headings with corrections and change status")fix_md (hdgs:list[dict], track_usage:bool=False)
Fix markdown headings
| Type | Default | Details | |
|---|---|---|---|
| hdgs | list | List of headings with page numbers | |
| track_usage | bool | False |
def fix_md(
    hdgs: list[dict], # List of headings with page numbers
    track_usage: bool=cfg.track_usage,
):
    "Fix markdown headings"
    # Wire up the LM from module config before building the predictor.
    lm_inst = dspy.LM(cfg.lm, api_key=cfg.api_key, max_tokens=cfg.max_tokens)
    dspy.configure(lm=lm_inst)
    dspy.settings.configure(track_usage=track_usage)
    formatted = format_hdgs(hdgs)
    predictor = dspy.ChainOfThought(FixHeadingHierarchy)
    return predictor(headings_with_pages=formatted)
# group_corrections_by_page (results:list[__main__.HeadingResult])
Group HeadingResult corrections by page number into dict with page nums as keys
| Type | Details | |
|---|---|---|
| results | list | List of headings with corrections and change status |
def group_corrections_by_page(
    results: list[HeadingResult], # List of headings with corrections and change status
):
    "Group HeadingResult corrections by page number into dict with page nums as keys"
    grouped = {}
    for res in results:
        # setdefault creates the per-page bucket on first sight of a page number
        grouped.setdefault(res.page, []).append(res)
    return grouped
# apply_corrections_to_page (page_nb, corrections, enhanced_path)
Apply corrections to a page in the enhanced directory
| Details | |
|---|---|
| page_nb | Page number |
| corrections | List of corrections |
| enhanced_path | Path to enhanced directory |
def apply_corrections_to_page(
    page_nb, # Page number
    corrections, # List of corrections
    enhanced_path, # Path to enhanced directory
):
    "Apply corrections to a page in the enhanced directory"
    target = enhanced_path / f"page_{page_nb}.md"
    page_lines = target.read_text().splitlines()
    pending = list(corrections)  # work on a copy; each correction is consumed at most once
    for idx, text in enumerate(page_lines):
        stripped = text.strip()
        for pos, corr in enumerate(pending):
            if stripped == corr.old.strip():
                # Rewrite the heading and tag it with its source page number.
                page_lines[idx] = f"{corr.new} .... page {page_nb}"
                pending.pop(pos)
                break
    target.write_text('\n'.join(page_lines))
# apply_all_corrections (results, enhanced_path)
Apply all corrections to the pages in enhanced directory
| Details | |
|---|---|
| results | List of headings with corrections and change status |
| enhanced_path | Path to enhanced directory |
def apply_all_corrections(
    results, # List of headings with corrections and change status
    enhanced_path, # Path to enhanced directory
):
    "Apply all corrections to the pages in enhanced directory"
    # Bucket corrections per page, then patch each page file once.
    for page_nb, page_corrections in group_corrections_by_page(results).items():
        apply_corrections_to_page(page_nb, page_corrections, enhanced_path)
# fix_doc_hdgs (src_dir, force=False)
Process the document directory
| Type | Default | Details | |
|---|---|---|---|
| src_dir | Path to the folder containing the document | ||
| force | bool | False | Whether to overwrite the existing enhanced directory |
def fix_doc_hdgs(
    src_dir, # Path to the folder containing the document
    force=False, # Whether to overwrite the existing enhanced directory
):
    "Process the document directory"
    src_path = Path(src_dir)
    enhanced_path = src_path / cfg.enhanced_dir
    # Guard: never clobber a previous run unless the caller opts in with force=True.
    if enhanced_path.exists() and not force:
        print(f"Enhanced directory '{cfg.enhanced_dir}' already exists. Use force=True to overwrite.")
        return
    if enhanced_path.exists() and force:
        shutil.rmtree(enhanced_path)
    enhanced_path = setup_enhanced_dir(src_dir)
    # Sort numerically on the N in "page_N.md" — lexicographic order would put page_10 before page_2.
    # (.ls/.sorted are fastcore Path/L extensions, not stdlib.)
    pages = enhanced_path.ls(file_exts=".md").sorted(key=lambda p: int(p.stem.split('_')[1]))
    result = fix_md(get_hdgs_with_pages(pages))
apply_all_corrections(result.results, enhanced_path)(#142) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/
final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_17.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_20.md')...]
has_images (page_path)
For instance:
[Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_1.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_11.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_12.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_14.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_15.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_16.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_21.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_22.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_23.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_29.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_30.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_38.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_59.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_60.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_63.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_68.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_84.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_95.md'),
Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/final_evaluation_report_final_olta_ndoja_pdf/enhanced/page_114.md')]
MarkdownPage (path)
A class to represent a markdown page
A class to represent an image reference
class ImgRef(AttrDict):
    "A class to represent an image reference"
    def __repr__(self):
        # Show a truncated single-line context plus whichever classification
        # fields (is_relevant, reason) have been attached so far.
        snippet = self.context.replace('\n', ' ')[:50] + "..."
        parts = [f"filename='{self.filename}'", f"context='{snippet}'"]
        for attr in ('is_relevant', 'reason'):
            if hasattr(self, attr):
                parts.append(f"{attr}={getattr(self, attr)}")
        return f"ImgRef({', '.join(parts)})"
# MarkdownPage.find_img_refs (context_lines:int=3)
Find all image references in the markdown page and include the context around the image
| Type | Default | Details | |
|---|---|---|---|
| context_lines | int | 3 | Number of lines of context to include around the image |
@patch
def find_img_refs(
    self:MarkdownPage, # Markdown page of interest
    context_lines: int = 3, # Number of lines of context to include around the image
):
    "Find all image references in the markdown page and include the context around the image"
    # FIX: the original ran the same regex twice per matching line (once in the
    # `if`, once to extract the match). Compile the pattern once, search once,
    # and reuse the match object.
    img_pat = re.compile(r'!\[[^\]]*\]\(([^)]+)\)')  # markdown image: ![alt](filename)
    lines = self.path.read_text().splitlines()
    results = []
    for i, line in enumerate(lines):
        match = img_pat.search(line)
        if match:
            # Window of +/- context_lines around the image line, clamped to the file.
            start = max(0, i - context_lines)
            end = min(len(lines), i + context_lines + 1)
            results.append(ImgRef({
                "filename": match.group(1),
                "context": '\n'.join(lines[start:end])
            }))
    return results
# For instance:
ImageRelevance (img_filename:str, surrounding_context:str, is_relevant:bool, reason:str)
*Determine if an image contains substantive content for document understanding.
RELEVANT: Charts, graphs, diagrams, figures, tables, screenshots, flowcharts IRRELEVANT: Logos, cover images, decorative elements, headers, footers*
# dspy Signature for binary image-relevance classification.
# NOTE: the docstring below is the LM prompt — it is left exactly as written.
class ImageRelevance(dspy.Signature):
    """Determine if an image contains substantive content for document understanding.
    RELEVANT: Charts, graphs, diagrams, figures, tables, screenshots, flowcharts
    IRRELEVANT: Logos, cover images, decorative elements, headers, footers
    """
    img_filename: str = dspy.InputField()
    surrounding_context: str = dspy.InputField(desc="Text context around the image")
    is_relevant: bool = dspy.OutputField(desc="True only for substantive content like data visualizations")
    reason: str = dspy.OutputField(desc="Brief explanation of decision")MarkdownPage.classify_imgs (img_refs:list[__main__.ImgRef])
Classify images in the markdown page
| Type | Details | |
|---|---|---|
| img_refs | list | List of image references |
@patch
def classify_imgs(
    self:MarkdownPage, # Markdown page of interest
    img_refs: list[ImgRef], # List of image references
):
    "Classify images in the markdown page"
    classifier = dspy.ChainOfThought(ImageRelevance)
    for img_ref in img_refs:
        # FIX: dropped the stray `page_nb=1` kwarg — ImageRelevance declares
        # only img_filename and surrounding_context as input fields, so the
        # extra argument was at best ignored and at worst rejected by dspy.
        result = classifier(
            img_filename=img_ref.filename,
            surrounding_context=img_ref.context,
        )
        # Attach the classification to the reference in place.
        img_ref.is_relevant = result.is_relevant
        img_ref.reason = result.reason
    return img_refs
# For instance:
[ImgRef(filename='img-0.jpeg', context=' Final Evaluation Report, 17 March 2023 ![img-0.j...')]
[]
describe_img (img_path:pathlib.Path, context:str, api_key:str=None, model:str='gemini/gemini-2.0-flash')
Describe an image using an LLM
| Type | Default | Details | |
|---|---|---|---|
| img_path | Path | Path to the image | |
| context | str | Context of the image | |
| api_key | str | None | API key for the LLM model |
| model | str | gemini/gemini-2.0-flash | Model to use |
def describe_img(
    img_path: Path, # Path to the image
    context: str, # Context of the image
    api_key: str = cfg.api_key, # API key for the LLM model
    model: str = cfg.lm, # Model to use
):
    "Describe an image using an LLM"
    # Inline the image as a base64 data URL; normalize the extension to a MIME subtype.
    encoded = base64.b64encode(img_path.read_bytes()).decode('utf-8')
    fmt = img_path.suffix.lower().lstrip('.')
    if fmt == 'jpg': fmt = 'jpeg'
    prompt = f"""Provide a concise paragraph description of this image for evaluation report analysis. Include: type of content, main topic, key data/statistics, trends, and takeaways. Write as flowing text, not numbered points. Context: {context}"""
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/{fmt};base64,{encoded}"}}
        ]
    }
    response = completion(model=model, messages=[message], api_key=api_key)
    return response.choices[0].message.content
# MarkdownPage.describe_imgs (img_refs:list[__main__.ImgRef], img_dir:str)
Describe images in the markdown page
| Type | Details | |
|---|---|---|
| img_refs | list | List of image references |
| img_dir | str | Image directory |
@patch
def describe_imgs(
    self:MarkdownPage, # Markdown page of interest
    img_refs: list[ImgRef], # List of image references
    img_dir: str # Image directory
):
    "Describe images in the markdown page"
    for img_ref in img_refs:
        # Only relevant images (per classify_imgs) are worth an LLM call.
        if img_ref.is_relevant:
            img_path = Path(img_dir) / img_ref.filename
            # FIX: rely on describe_img's cfg-driven api_key default instead of
            # hard-coding the GEMINI_API_KEY global, keeping credential handling
            # consistent with the rest of the module (cfg.api_key).
            img_ref.description = describe_img(img_path, img_ref.context)
    return img_refs
# For instance:
MarkdownPage.replace_imgs_with_desc (img_refs, enriched_dir:str='enriched')
Replace images with their descriptions in the markdown page
| Type | Default | Details | |
|---|---|---|---|
| img_refs | List of image references | ||
| enriched_dir | str | enriched | Enriched directory |
@patch
def replace_imgs_with_desc(
    self:MarkdownPage, # Markdown page of interest
    img_refs, # List of image references
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
):
    "Replace images with their descriptions in the markdown page"
    # Enriched output lives in a sibling of the page's parent directory.
    enriched_path = self.path.parent.parent / enriched_dir
    enriched_path.mkdir(exist_ok=True)
    content = self.path.read_text()
    for img_ref in img_refs:
        if img_ref.is_relevant and hasattr(img_ref, 'description'):
            pattern = f'!\\[[^\\]]*\\]\\({re.escape(img_ref.filename)}\\)'
            # FIX: pass the replacement via a callable so re.sub does not
            # interpret backslashes or group references (e.g. "\1") inside the
            # LLM-generated description as replacement-template escapes.
            content = re.sub(pattern, lambda m, d=img_ref.description: d, content)
    enriched_file = enriched_path / self.path.name
    enriched_file.write_text(content)
    return enriched_file
# copy_page_to_enriched (page, enriched_dir:str='enriched')
Copy a page to the enriched directory
| Type | Default | Details | |
|---|---|---|---|
| page | Page to copy | ||
| enriched_dir | str | enriched | Enriched directory |
process_single_page (page, img_dir, enriched_dir:str='enriched')
Process a single page
| Type | Default | Details | |
|---|---|---|---|
| page | Page to process | ||
| img_dir | Image directory | ||
| enriched_dir | str | enriched | Enriched directory |
def process_single_page(
    page, # Page to process
    img_dir, # Image directory
    enriched_dir: str = cfg.enriched_dir, # Enriched directory
):
    "Process a single page"
    md_page = MarkdownPage(page)
    refs = md_page.find_img_refs()
    # Pages without images are copied through unchanged.
    if not refs:
        return copy_page_to_enriched(page, enriched_dir)
    # Pipeline: find -> classify -> describe -> replace, pausing between LLM stages.
    refs = md_page.classify_imgs(refs)
    time.sleep(0.5)  # throttle API calls
    refs = md_page.describe_imgs(refs, img_dir)
    time.sleep(0.5)
    return md_page.replace_imgs_with_desc(refs)
# enrich_images (pages_dir, img_dir, n_workers=2)
Enrich images in the pages directory
| Type | Default | Details | |
|---|---|---|---|
| pages_dir | Pages directory | ||
| img_dir | Image directory | ||
| n_workers | int | 2 | Number of workers |
def enrich_images(
    pages_dir, # Pages directory
    img_dir, # Image directory
    n_workers=2, # Number of workers
):
    "Enrich images in the pages directory"
    pages = Path(pages_dir).ls(file_exts=".md")  # fastcore Path.ls
    # Single pass: queue image-bearing pages, copy the rest straight through.
    with_imgs = []
    for page in pages:
        if has_images(page):
            with_imgs.append(page)
        else:
            copy_page_to_enriched(page)
    if with_imgs:
        # Thread pool is fine here: the work is LLM/API-bound, not CPU-bound.
        worker = partial(process_single_page, img_dir=img_dir)
        parallel(worker, with_imgs, n_workers=n_workers, threadpool=True, progress=True)
    print(f"✓ Processed {len(pages)} pages ({len(with_imgs)} with images)")
# md_plus_evaluation (eval_id:str, md_dir:str='../data/md_library', overwrite:bool=False)
Fix markdown headings and enrich images for an evaluation report
| Type | Default | Details | |
|---|---|---|---|
| eval_id | str | Evaluation ID to process | |
| md_dir | str | ../data/md_library | Directory containing markdown folders |
| overwrite | bool | False | Overwrite if enhanced/enriched already exists |