Exported source
# Module-wide settings shared by the heading-fixing helpers.
cfg = AttrDict({
    'fixed_suffix': '_fixed',  # appended to a page's file stem when the corrected copy is written
    'lm': 'gemini/gemini-2.0-flash-exp',  # model identifier handed to dspy.LM
    'api_key': GEMINI_API_KEY,  # credential handed to dspy.LM
    'max_tokens': 8192,  # max_tokens value handed to dspy.LM
    'track_usage': False,  # default for fix_md's track_usage parameter
})
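This module aims to fix and enrich markdown headings from OCR’d PDF files by correcting their hierarchy levels with an LLM and appending to each heading the number of the page it appears on.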
# Folder containing the per-page markdown files of the example document.
doc = src_dir / 'abridged_evaluation_report_final_olta_ndoja_pdf'
# Keep only source pages: skip any file whose stem contains the fixed-output suffix.
pages = [p for p in doc.ls(file_exts=".md") if cfg.fixed_suffix not in p.stem]
# Sort numerically on the number after the first underscore in the stem ('page_10' -> 10);
# the trailing `; pages` only displays the result in the notebook.
pages = L(pages).sorted(key=lambda p: int(p.stem.split('_')[1])); pages
(#31) [Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_1.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_2.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_3.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_4.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_5.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_6.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_7.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_8.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_9.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_10.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_11.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_12.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_13.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_14.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_15.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_16.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_rep
ort_final_olta_ndoja_pdf/page_17.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_18.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_19.md'),Path('../_data/md_library/49d2fba781b6a7c0d94577479636ee6f/abridged_evaluation_report_final_olta_ndoja_pdf/page_20.md')...]
get_hdgs (md_txt)
get_hdgs_with_pages (pages:list[pathlib.Path])
Get headings and the page number they are on
Type | Details | |
---|---|---|
pages | list | List of pages |
def get_hdgs_with_pages(
    pages: list[Path]  # List of pages
):
    "Pair every heading found in `pages` with the 1-based number of its page"
    # The page number is the position of the file in `pages` (first entry is page 1);
    # it is not parsed from the file name.
    return [
        {'heading': hdg, 'page': page_nb}
        for page_nb, page_path in enumerate(pages, start=1)
        for hdg in get_hdgs(page_path.read_text())
    ]
[{'heading': '# **PPMi**', 'page': 1},
{'heading': '# CONTENTS ', 'page': 3},
{'heading': '# 1. Introduction ', 'page': 4},
{'heading': '# 2. Background of the JI-HoA ', 'page': 5},
{'heading': '### 2.1. Context and design of the JI-HoA', 'page': 5}]
(#5) ['# **PPMi**','# CONTENTS ','# 1. Introduction ','# 2. Background of the JI-HoA ','### 2.1. Context and design of the JI-HoA']
format_hdgs (hdgs:list[dict])
Format headings with page numbers
Type | Details | |
---|---|---|
hdgs | list | List of headings with page numbers |
def format_hdgs(
    hdgs: list[dict]  # List of headings with page numbers
):
    "Render one line per heading, tagged with its page and its rank on that page"
    seen_on_page = {}  # page number -> how many headings were already emitted for it

    def _render(entry):
        pg = entry['page']
        rank = seen_on_page.get(pg, 0) + 1
        seen_on_page[pg] = rank
        return f"{entry['heading']} (Page {pg}, Position {rank})"

    # Headings are rendered in input order, so positions follow that order within each page.
    return "\n".join(map(_render, hdgs))
# **PPMi** (Page 1, Position 1)
# CONTENTS (Page 3, Position 1)
# 1. Introduction (Page 4, Position 1)
# 2. Background of the JI-HoA (Page 5, Position 1)
### 2.1. Context and design of the JI-HoA (Page 5, Position 2)
# 2.2. External factors affecting the implementation of the JI (Page 7, Position 1)
# 3. Methodology (Page 8, Position 1)
# 4. Findings (Page 10, Position 1)
### 4.1. Relevance (Page 10, Position 2)
### 4.1.1. Relevance of programme activities for migrants, returnees, and comm
HeadingResult (old:str, page:int, position:int, new:str, changed:bool)
*!!! abstract “Usage Documentation” Models
A base class for creating Pydantic models.
Attributes: `__class_vars__`: The names of the class variables defined on the model. `__private_attributes__`: Metadata about the private attributes of the model. `__signature__`: The synthesized `__init__` [`Signature`][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
__args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.*
FixHeadingHierarchy (headings_with_pages:str, results:List[__main__.HeadingResult])
Fix markdown heading hierarchy by analyzing the document’s numbering patterns: - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.) - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level - When a section number is lower than a previously seen number at the same level (e.g., seeing ‘2.’ after ‘3.1’), it’s likely a subsection or list item, not a main section - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections - Return ALL headings with their corrected form
# DSPy signature describing the heading-correction task: one text input, one structured output.
# NOTE(review): the class docstring presumably doubles as the instructions sent to the LM,
# so treat its wording as runtime behavior — confirm before editing it.
class FixHeadingHierarchy(dspy.Signature):
    """Fix markdown heading hierarchy by analyzing the document's numbering patterns:
    - Detect numbering scheme (1.2.3, I.A.1, A.1.a, etc.)
    - Apply hierarchy levels based on nesting depth: # for top level, ## for second level, ### for third level
    - When a section number is lower than a previously seen number at the same level (e.g., seeing '2.' after '3.1'), it's likely a subsection or list item, not a main section
    - Unnumbered headings: keep as-is if at document boundaries, treat as subsections if within numbered sections
    - Return ALL headings with their corrected form
    """
    # Input: the headings as a single string; fix_md fills this with the output of format_hdgs.
    headings_with_pages: str = dspy.InputField(desc="List of headings with page numbers")
    # Output: a list of HeadingResult records (old, page, position, new, changed).
    results: List[HeadingResult] = dspy.OutputField(desc="All headings with corrections and change status")
fix_md (hdgs:list[dict], track_usage:bool=False)
Fix markdown headings
Type | Default | Details | |
---|---|---|---|
hdgs | list | List of headings with page numbers | |
track_usage | bool | False |
def fix_md(
    hdgs: list[dict],  # List of headings with page numbers
    track_usage: bool=cfg.track_usage,  # Forwarded to dspy.settings.configure(track_usage=...)
):
    "Send the formatted headings to the configured LM and return its prediction"
    # NOTE(review): both configure calls appear to change dspy's global settings,
    # so they would remain in effect after this function returns — confirm.
    dspy.configure(lm=dspy.LM(cfg.lm, api_key=cfg.api_key, max_tokens=cfg.max_tokens))
    dspy.settings.configure(track_usage=track_usage)
    predictor = dspy.ChainOfThought(FixHeadingHierarchy)
    # The signature's single input is the text produced by format_hdgs.
    return predictor(headings_with_pages=format_hdgs(hdgs))
# Run the heading fixer on the previously extracted headings with usage tracking switched on.
result = fix_md(hdgs, track_usage=True)
# Show the full prediction (reasoning plus the list of HeadingResult records).
print("Result:", result)
# Show whatever usage information the prediction reports via get_lm_usage().
print("Usage:", result.get_lm_usage())
Result: Prediction(
reasoning='The provided headings have inconsistent hierarchy. I will correct the hierarchy based on the numbering and content. The numbering scheme appears to be a mix of numbered sections (1, 2, 3, etc.) and subsections (2.1, 2.1.1, etc.). I will use this numbering to determine the appropriate heading level. Unnumbered headings within numbered sections will be treated as subsections. Headings that appear to be list items (e.g., "3. Increase attention...") will be treated as top-level headings.',
results=[HeadingResult(old='# **PPMi**', page=1, position=1, new='# **PPMi**', changed=False), HeadingResult(old='# CONTENTS', page=3, position=1, new='# CONTENTS', changed=False), HeadingResult(old='# 1. Introduction', page=4, position=1, new='# 1. Introduction', changed=False), HeadingResult(old='# 2. Background of the JI-HoA', page=5, position=1, new='# 2. Background of the JI-HoA', changed=False), HeadingResult(old='### 2.1. Context and design of the JI-HoA', page=5, position=2, new='## 2.1. Context and design of the JI-HoA', changed=False), HeadingResult(old='# 2.2. External factors affecting the implementation of the JI', page=7, position=1, new='## 2.2. External factors affecting the implementation of the JI', changed=True), HeadingResult(old='# 3. Methodology', page=8, position=1, new='# 3. Methodology', changed=False), HeadingResult(old='# 4. Findings', page=10, position=1, new='# 4. Findings', changed=False), HeadingResult(old='### 4.1. Relevance', page=10, position=2, new='## 4.1. Relevance', changed=False), HeadingResult(old='### 4.1.1. Relevance of programme activities for migrants, returnees, and communities', page=10, position=3, new='### 4.1.1. Relevance of programme activities for migrants, returnees, and communities', changed=False), HeadingResult(old='## Overall performance score for relevance: $3.9 / 5$ <br> Robustness score for the evidence: $4.5 / 5$', page=10, position=4, new='#### Overall performance score for relevance: $3.9 / 5$ <br> Robustness score for the evidence: $4.5 / 5$', changed=True), HeadingResult(old='### 4.1.1.1 Needs of migrants', page=10, position=5, new='#### 4.1.1.1 Needs of migrants', changed=False), HeadingResult(old='### 4.1.1.2 Needs of returnees', page=10, position=6, new='#### 4.1.1.2 Needs of returnees', changed=False), HeadingResult(old='# 4.1.1.3 Needs of community members', page=12, position=1, new='#### 4.1.1.3 Needs of community members', changed=True), HeadingResult(old="### 4.1.2. 
Programme's relevance to the needs of stakeholders", page=12, position=2, new="### 4.1.2. Programme's relevance to the needs of stakeholders", changed=False), HeadingResult(old='### 4.1.2.1 Needs of governments', page=12, position=3, new='#### 4.1.2.1 Needs of governments', changed=False), HeadingResult(old='# 4.1.2.2 Needs of other stakeholders', page=13, position=1, new='#### 4.1.2.2 Needs of other stakeholders', changed=True), HeadingResult(old='### 4.2. Coherence', page=13, position=2, new='## 4.2. Coherence', changed=False), HeadingResult(old="# 4.2.1. The JI-HoA's alignment with the objectives and standards of IOM, and objectives of the EU", page=14, position=1, new="### 4.2.1. The JI-HoA's alignment with the objectives and standards of IOM, and objectives of the EU", changed=True), HeadingResult(old='### 4.2.2. Alignment with other initiatives', page=14, position=2, new='### 4.2.2. Alignment with other initiatives', changed=False), HeadingResult(old='# 4.3. Effectiveness', page=16, position=1, new='## 4.3. Effectiveness', changed=True), HeadingResult(old='### 4.3.1. Specific Objective 1: Partner countries and relevant stakeholders developed or strengthened evidence-based return and reintegration procedures', page=16, position=2, new='### 4.3.1. 
Specific Objective 1: Partner countries and relevant stakeholders developed or strengthened evidence-based return and reintegration procedures', changed=False), HeadingResult(old='### 4.3.1.1 Achievement of outputs and results', page=16, position=3, new='#### 4.3.1.1 Achievement of outputs and results', changed=False), HeadingResult(old='## Data availability', page=16, position=4, new='##### Data availability', changed=True), HeadingResult(old='# Capacity of stakeholders', page=17, position=1, new='##### Capacity of stakeholders', changed=True), HeadingResult(old='### 4.3.1.2 Achievement of Specific Objective 1', page=17, position=2, new='#### 4.3.1.2 Achievement of Specific Objective 1', changed=False), HeadingResult(old='# 4.3.2. Specific Objective 2: Safe, humane, dignified voluntary return processes are enhanced along main migration routes', page=18, position=1, new='### 4.3.2. Specific Objective 2: Safe, humane, dignified voluntary return processes are enhanced along main migration routes', changed=True), HeadingResult(old='# 4.3.2.1 Achievement of outputs and results', page=19, position=1, new='#### 4.3.2.1 Achievement of outputs and results', changed=True), HeadingResult(old='## Outreach and awareness', page=19, position=2, new='##### Outreach and awareness', changed=True), HeadingResult(old='## Assistance to stranded migrants', page=19, position=3, new='##### Assistance to stranded migrants', changed=True), HeadingResult(old='# 4.3.2.2 Achievement of the Objective', page=20, position=1, new='#### 4.3.2.2 Achievement of the Objective', changed=True), HeadingResult(old='### 4.3.3. Specific Objective 3: Returnees are sustainably integrated in host communities, and host communities are better able to create living standards that address drivers of migration.', page=20, position=2, new='### 4.3.3. 
Specific Objective 3: Returnees are sustainably integrated in host communities, and host communities are better able to create living standards that address drivers of migration.', changed=False), HeadingResult(old='### 4.3.3.1 Achievement of outputs and results', page=20, position=3, new='#### 4.3.3.1 Achievement of outputs and results', changed=False), HeadingResult(old='## Individual and community-based reintegration', page=20, position=4, new='##### Individual and community-based reintegration', changed=True), HeadingResult(old='# M\\&E systems', page=21, position=1, new='##### M\\&E systems', changed=True), HeadingResult(old='# 4.3.3.2 Achievement of Specific Objective 3', page=22, position=1, new='#### 4.3.3.2 Achievement of Specific Objective 3', changed=True), HeadingResult(old='## Overall achievement of reintegration', page=22, position=2, new='##### Overall achievement of reintegration', changed=True), HeadingResult(old='# Sustainability of reintegration', page=23, position=1, new='##### Sustainability of reintegration', changed=True), HeadingResult(old='### 4.3.4. Functioning of the Integrated Approach', page=23, position=2, new='### 4.3.4. Functioning of the Integrated Approach', changed=False), HeadingResult(old='# 4.4. Efficiency', page=24, position=1, new='## 4.4. Efficiency', changed=True), HeadingResult(old='### 4.4.3. Did the programme receive sufficient resources to achieve its objectives?', page=24, position=2, new='### 4.4.3. Did the programme receive sufficient resources to achieve its objectives?', changed=False), HeadingResult(old='# 4.4.2. Cost-effectiveness and efficiency of the programme', page=25, position=1, new='### 4.4.2. Cost-effectiveness and efficiency of the programme', changed=True), HeadingResult(old='# 4.5. Sustainability', page=26, position=1, new='## 4.5. 
Sustainability', changed=True), HeadingResult(old='## Overall performance score for sustainability: $2.5 / 5$ <br> Robustness score for the evidence: $4 / 5$', page=26, position=2, new='#### Overall performance score for sustainability: $2.5 / 5$ <br> Robustness score for the evidence: $4 / 5$', changed=True), HeadingResult(old='# 5. Conclusions and Recommendations', page=27, position=1, new='# 5. Conclusions and Recommendations', changed=False), HeadingResult(old='### 5.1. Conclusions', page=27, position=2, new='## 5.1. Conclusions', changed=False), HeadingResult(old='# 5.2. Recommendations', page=28, position=1, new='## 5.2. Recommendations', changed=True), HeadingResult(old='# 3. Increase attention on building partnerships with service providers who can function without (significant) funding channelled by IOM.', page=29, position=1, new='# 3. Increase attention on building partnerships with service providers who can function without (significant) funding channelled by IOM.', changed=True), HeadingResult(old='# 5. Explore opportunities to extend the scope of support provided to returnees, with a focus on longer-term integration.', page=30, position=1, new='# 5. Explore opportunities to extend the scope of support provided to returnees, with a focus on longer-term integration.', changed=True)]
)
Usage: {}
group_corrections_by_page (results:list[__main__.HeadingResult])
Group HeadingResult corrections by page number into dict with page nums as keys
Type | Details | |
---|---|---|
results | list | List of headings with corrections and change status |
def group_corrections_by_page(
    results: list[HeadingResult],  # List of headings with corrections and change status
):
    "Bucket the corrections by their `page` attribute, keeping input order inside each bucket"
    by_page = {}
    for correction in results:
        # setdefault creates the page's bucket the first time that page is seen
        by_page.setdefault(correction.page, []).append(correction)
    return by_page
{1: [HeadingResult(old='# **PPMi**', page=1, position=1, new='# **PPMi**', changed=False)],
3: [HeadingResult(old='# CONTENTS', page=3, position=1, new='# CONTENTS', changed=False)],
4: [HeadingResult(old='# 1. Introduction', page=4, position=1, new='# 1. Introduction', changed=False)],
5: [HeadingResult(old='# 2. Background of the JI-HoA', page=5, position=1, new='# 2. Background of the JI-HoA', changed=False),
HeadingResult(old='### 2.1. Context and design of the JI-HoA', page=5, position=2, new='## 2.1. Context and design of the JI-HoA', changed=False)],
7: [HeadingResult(old='# 2.2. External factors affecting the implementation of the JI', page=7, position=1, new='## 2.2. External factors affecting the implementation of the JI', changed=True)],
8: [HeadingResult(old='# 3. Methodology', page=8, position=1, new='# 3. Methodology', changed=False)],
10: [HeadingResult(old='# 4. Findings', page=10, position=1, new='# 4. Findings', changed=False),
HeadingResult(old='### 4.1. Relevance', page=10, position=2, new='## 4.1. Relevance', changed=False),
HeadingResult(old='### 4.1.1. Relevance of programme activities for migrants, returnees, and communities', page=10, position=3, new='### 4.1.1. Relevance of programme activities for migrants, returnees, and communities', changed=False),
HeadingResult(old='## Overall performance score for relevance: $3.9 / 5$ <br> Robustness score for the evidence: $4.5 / 5$', page=10, position=4, new='#### Overall performance score for relevance: $3.9 / 5$ <br> Robustness score for the evidence: $4.5 / 5$', changed=True),
HeadingResult(old='### 4.1.1.1 Needs of migrants', page=10, position=5, new='#### 4.1.1.1 Needs of migrants', changed=False),
HeadingResult(old='### 4.1.1.2 Needs of returnees', page=10, position=6, new='#### 4.1.1.2 Needs of returnees', changed=False)],
12: [HeadingResult(old='# 4.1.1.3 Needs of community members', page=12, position=1, new='#### 4.1.1.3 Needs of community members', changed=True),
HeadingResult(old="### 4.1.2. Programme's relevance to the needs of stakeholders", page=12, position=2, new="### 4.1.2. Programme's relevance to the needs of stakeholders", changed=False),
HeadingResult(old='### 4.1.2.1 Needs of governments', page=12, position=3, new='#### 4.1.2.1 Needs of governments', changed=False)],
13: [HeadingResult(old='# 4.1.2.2 Needs of other stakeholders', page=13, position=1, new='#### 4.1.2.2 Needs of other stakeholders', changed=True),
HeadingResult(old='### 4.2. Coherence', page=13, position=2, new='## 4.2. Coherence', changed=False)],
14: [HeadingResult(old="# 4.2.1. The JI-HoA's alignment with the objectives and standards of IOM, and objectives of the EU", page=14, position=1, new="### 4.2.1. The JI-HoA's alignment with the objectives and standards of IOM, and objectives of the EU", changed=True),
HeadingResult(old='### 4.2.2. Alignment with other initiatives', page=14, position=2, new='### 4.2.2. Alignment with other initiatives', changed=False)],
16: [HeadingResult(old='# 4.3. Effectiveness', page=16, position=1, new='## 4.3. Effectiveness', changed=True),
HeadingResult(old='### 4.3.1. Specific Objective 1: Partner countries and relevant stakeholders developed or strengthened evidence-based return and reintegration procedures', page=16, position=2, new='### 4.3.1. Specific Objective 1: Partner countries and relevant stakeholders developed or strengthened evidence-based return and reintegration procedures', changed=False),
HeadingResult(old='### 4.3.1.1 Achievement of outputs and results', page=16, position=3, new='#### 4.3.1.1 Achievement of outputs and results', changed=False),
HeadingResult(old='## Data availability', page=16, position=4, new='##### Data availability', changed=True)],
17: [HeadingResult(old='# Capacity of stakeholders', page=17, position=1, new='##### Capacity of stakeholders', changed=True),
HeadingResult(old='### 4.3.1.2 Achievement of Specific Objective 1', page=17, position=2, new='#### 4.3.1.2 Achievement of Specific Objective 1', changed=False)],
18: [HeadingResult(old='# 4.3.2. Specific Objective 2: Safe, humane, dignified voluntary return processes are enhanced along main migration routes', page=18, position=1, new='### 4.3.2. Specific Objective 2: Safe, humane, dignified voluntary return processes are enhanced along main migration routes', changed=True)],
19: [HeadingResult(old='# 4.3.2.1 Achievement of outputs and results', page=19, position=1, new='#### 4.3.2.1 Achievement of outputs and results', changed=True),
HeadingResult(old='## Outreach and awareness', page=19, position=2, new='##### Outreach and awareness', changed=True),
HeadingResult(old='## Assistance to stranded migrants', page=19, position=3, new='##### Assistance to stranded migrants', changed=True)],
20: [HeadingResult(old='# 4.3.2.2 Achievement of the Objective', page=20, position=1, new='#### 4.3.2.2 Achievement of the Objective', changed=True),
HeadingResult(old='### 4.3.3. Specific Objective 3: Returnees are sustainably integrated in host communities, and host communities are better able to create living standards that address drivers of migration.', page=20, position=2, new='### 4.3.3. Specific Objective 3: Returnees are sustainably integrated in host communities, and host communities are better able to create living standards that address drivers of migration.', changed=False),
HeadingResult(old='### 4.3.3.1 Achievement of outputs and results', page=20, position=3, new='#### 4.3.3.1 Achievement of outputs and results', changed=False),
HeadingResult(old='## Individual and community-based reintegration', page=20, position=4, new='##### Individual and community-based reintegration', changed=True)],
21: [HeadingResult(old='# M\\&E systems', page=21, position=1, new='##### M\\&E systems', changed=True)],
22: [HeadingResult(old='# 4.3.3.2 Achievement of Specific Objective 3', page=22, position=1, new='#### 4.3.3.2 Achievement of Specific Objective 3', changed=True),
HeadingResult(old='## Overall achievement of reintegration', page=22, position=2, new='##### Overall achievement of reintegration', changed=True)],
23: [HeadingResult(old='# Sustainability of reintegration', page=23, position=1, new='##### Sustainability of reintegration', changed=True),
HeadingResult(old='### 4.3.4. Functioning of the Integrated Approach', page=23, position=2, new='### 4.3.4. Functioning of the Integrated Approach', changed=False)],
24: [HeadingResult(old='# 4.4. Efficiency', page=24, position=1, new='## 4.4. Efficiency', changed=True),
HeadingResult(old='### 4.4.3. Did the programme receive sufficient resources to achieve its objectives?', page=24, position=2, new='### 4.4.3. Did the programme receive sufficient resources to achieve its objectives?', changed=False)],
25: [HeadingResult(old='# 4.4.2. Cost-effectiveness and efficiency of the programme', page=25, position=1, new='### 4.4.2. Cost-effectiveness and efficiency of the programme', changed=True)],
26: [HeadingResult(old='# 4.5. Sustainability', page=26, position=1, new='## 4.5. Sustainability', changed=True),
HeadingResult(old='## Overall performance score for sustainability: $2.5 / 5$ <br> Robustness score for the evidence: $4 / 5$', page=26, position=2, new='#### Overall performance score for sustainability: $2.5 / 5$ <br> Robustness score for the evidence: $4 / 5$', changed=True)],
27: [HeadingResult(old='# 5. Conclusions and Recommendations', page=27, position=1, new='# 5. Conclusions and Recommendations', changed=False),
HeadingResult(old='### 5.1. Conclusions', page=27, position=2, new='## 5.1. Conclusions', changed=False)],
28: [HeadingResult(old='# 5.2. Recommendations', page=28, position=1, new='## 5.2. Recommendations', changed=True)],
29: [HeadingResult(old='# 3. Increase attention on building partnerships with service providers who can function without (significant) funding channelled by IOM.', page=29, position=1, new='# 3. Increase attention on building partnerships with service providers who can function without (significant) funding channelled by IOM.', changed=True)],
30: [HeadingResult(old='# 5. Explore opportunities to extend the scope of support provided to returnees, with a focus on longer-term integration.', page=30, position=1, new='# 5. Explore opportunities to extend the scope of support provided to returnees, with a focus on longer-term integration.', changed=True)]}
apply_corrections_to_page (page_nb, corrections, pages_list, suffix='_fixed')
Apply corrections to a page by replacing original headings with corrected versions and page numbers
Type | Default | Details | |
---|---|---|---|
page_nb | Page number | ||
corrections | List of corrections | ||
pages_list | List of pages | ||
suffix | str | _fixed | Suffix for the new file |
def apply_corrections_to_page(
    page_nb,  # Page number
    corrections,  # List of corrections
    pages_list,  # List of pages
    suffix=cfg.fixed_suffix,  # Suffix for the new file
):
    "Apply corrections to a page by replacing original headings with corrected versions and page numbers"
    # Page numbers are 1-based. Without this guard, 0 or a negative number would wrap around
    # (pages_list[-1], ...) and silently rewrite the wrong page; numbers that are too large
    # already raise IndexError on the lookup below, so the same exception type is used here.
    if page_nb < 1:
        raise IndexError(f"page number must be >= 1, got {page_nb}")
    page_file = pages_list[page_nb - 1]
    lines = page_file.read_text().splitlines()
    # Work on a copy that shrinks as corrections are used, so a heading that occurs twice
    # on the page consumes two successive corrections and the caller's list is untouched.
    pending = list(corrections)
    for i, line in enumerate(lines):
        stripped = line.strip()
        for correction in pending:
            # Compare ignoring surrounding whitespace on both sides.
            if stripped == correction.old.strip():
                lines[i] = f"{correction.new} .... page {page_nb}"
                pending.remove(correction)
                break
    # The result is written next to the source page as '<stem><suffix><ext>';
    # lines are joined with '\n' and no trailing newline is added.
    new_file = page_file.with_stem(f"{page_file.stem}{suffix}")
    new_file.write_text('\n'.join(lines))
apply_all_corrections (results, pages_list)
Apply all corrections to the pages
Details | |
---|---|
results | List of headings with corrections and change status |
pages_list | List of pages |
def apply_all_corrections(
    results,  # List of headings with corrections and change status
    pages_list,  # List of pages
):
    "Write a corrected copy of every page that has at least one heading result"
    # Only pages that appear in `results` are processed; each is handled in one call.
    for nb, page_corrections in group_corrections_by_page(results).items():
        apply_corrections_to_page(nb, page_corrections, pages_list)
fix_doc_hdgs (src_dir, force=False)
Process the document directory
Type | Default | Details | |
---|---|---|---|
src_dir | Path to the folder containing the document | ||
force | bool | False | Whether to overwrite the existing files |
def fix_doc_hdgs(
    src_dir,  # Path to the folder containing the document
    force=False,  # Whether to overwrite the existing files
):
    "Fix the headings of every markdown page in a document folder and write the corrected copies"
    doc_dir = Path(src_dir)
    stale = list(doc_dir.glob(f"*{cfg.fixed_suffix}.md"))
    if stale:
        if not force:
            print(f"Found {len(stale)} {cfg.fixed_suffix} files. Use force=True to overwrite.")
            return
        # Remove earlier outputs first so the page listing below does not pick them up.
        # NOTE(review): Path.delete is not part of pathlib; presumably it comes from
        # fastcore's Path patches — confirm.
        for old in stale:
            old.delete()
    # Pages are ordered by the integer after the first underscore in the stem ('page_10' -> 10).
    md_pages = doc_dir.ls(file_exts=".md")
    ordered = md_pages.sorted(key=lambda p: int(p.stem.split('_')[1]))
    hdgs = get_hdgs_with_pages(ordered)
    prediction = fix_md(hdgs)
    apply_all_corrections(prediction.results, ordered)