Downloaders

Downloaders are responsible for downloading evaluation documents from the repositories of various sources (e.g. IOM, UNHCR).

flatten_evals_id_doc

 flatten_evals_id_doc (evals:fastcore.foundation.L, cfg:dict={'id_field':
                       'id', 'docs_field': 'docs', 'url_field': 'File
                       URL'})

Flatten evaluation records into list of (id, url) tuples

	Type	Default	Details
evals	L		List of evaluation records
cfg	dict	{‘id_field’: ‘id’, ‘docs_field’: ‘docs’, ‘url_field’: ‘File URL’}	Config dict with field names
Returns	L		Returns list of (id, url) tuples

Exported source

def flatten_evals_id_doc(
    evals: L,  # List of evaluation records
    cfg: dict = default_config  # Config dict with field names
) -> L:  # Returns list of (id, url) tuples
    "Pair each document URL with the id of the evaluation record that lists it"
    pairs = []
    for record in evals:
        # One output tuple per document; a record without documents yields nothing.
        for doc in record[cfg['docs_field']]:
            pairs.append((record[cfg['id_field']], doc[cfg['url_field']]))
    return L(pairs)

# Flatten the sample evaluations file into (eval_id, url) pairs and display them.
fname = 'files/test/evaluations.json'
docs_to_download = flatten_evals_id_doc(load_evals(fname)); docs_to_download

(#24) [('1a57974ab89d7280988aa6b706147ce1', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_NG20P0516_MAY_2023_FINAL_Abderrahim%20EL%20MOULAT.pdf'),('1a57974ab89d7280988aa6b706147ce1', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/RR0163_Evaluation%20Brief_MAY_%202023_Abderrahim%20EL%20MOULAT.pdf'),('c660e774d14854e20dc74457712b50ec', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IB0238_Evaluation%20Brief_FEB_%202023_Abderrahim%20EL%20MOULAT.pdf'),('c660e774d14854e20dc74457712b50ec', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_IB0238__FEB_2023_FINAL%20RE_Abderrahim%20EL%20MOULAT.pdf'),('2cae361c6779b561af07200e3d4e4051', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IB0053_Evaluation%20Brief_SEP_%202022_Abderrahim%20EL%20MOULAT.pdf'),('2cae361c6779b561af07200e3d4e4051', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_IB0053_OCT_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf'),('a9dea21fd254df7759b3936903e0a885', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_NC0030_JUNE_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf'),('a9dea21fd254df7759b3936903e0a885', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/NC0030_Evaluation%20Brief_June%202022_Abderrahim%20EL%20MOULAT.pdf'),('f0b09b92ea8ad6dddd9623de68a8d278', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/CD0015_Evaluation%20Brief_May%202022_Abderrahim%20EL%20MOULAT.pdf'),('f0b09b92ea8ad6dddd9623de68a8d278', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Projet%20CD0015_Final%20Evaluation%20Report_May_202_Abderrahim%20EL%20MOULAT.pdf'),('0456b0faaea16715afdb96969c337bc1', 
'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_Retour%20Vert_JUL_2021_Fina_Abderrahim%20EL%20MOULAT.pdf'),('0456b0faaea16715afdb96969c337bc1', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/NC0012_Evaluation%20Brief_JUL%202021_Abderrahim%20EL%20MOULAT.pdf'),('d5d71db805eeae249d7a2cf381be05cb', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Nigeria%20GIZ%20Internal%20Evaluation_JANUARY_2021__Abderrahim%20EL%20MOULAT.pdf'),('d5d71db805eeae249d7a2cf381be05cb', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Nigeria%20GIZ%20Project_Evaluation%20Brief_JAN%202021_Abderrahim%20EL%20MOULAT_0.pdf'),('f365264e3f69efb4c61b0cecb374e0f4', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Evaluation%20Brief_ARCO_Shiraz%20JERBI.pdF'),('f365264e3f69efb4c61b0cecb374e0f4', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Final%20evaluation%20report_ARCO_Shiraz%20JERBI_1.pdf'),('f365264e3f69efb4c61b0cecb374e0f4', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Management%20Response%20Matrix_ARCO_Shiraz%20JERBI.pdf'),('93e51fcbedaaddd6a7e38fd8ee039614', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IOM%20MANAGEMENT%20RESPONSE%20MATRIX.pdf'),('93e51fcbedaaddd6a7e38fd8ee039614', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IOM%20Niger%20-%20MIRAA%20III%20-%20Final%20Evaluation%20Report%20%28003%29.pdf'),('5efee25fe816456e0af729cb91896a38', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/CE.0369%20-%20IDEE%20-%20ANNEXE%201%20-%20Rapport%20Recherche_Joanie%20DUROCHER_0.pdf')...]

source

mk_dirs

 mk_dirs (docs_to_download:fastcore.foundation.L, base_dir:pathlib.Path)

Create directories for each unique evaluation ID

	Type	Details
docs_to_download	L	list of (eval_id, url) tuples
base_dir	Path	path to the base directory
Returns	None

Exported source

def mk_dirs(
    docs_to_download: L, # list of (eval_id, url) tuples
    base_dir: Path # path to the base directory
    ) -> None:
    "Make sure `base_dir` contains one sub-directory per distinct evaluation ID"
    # Several documents can share an evaluation ID, so de-duplicate first.
    for eval_id in {doc_eval_id for doc_eval_id, _ in docs_to_download}:
        target_dir = base_dir / eval_id
        # Missing parents are created; an already existing directory is left as is.
        target_dir.mkdir(parents=True, exist_ok=True)

# Only the first two (eval_id, url) pairs are passed, so only their evaluation IDs get a folder.
base_dir = Path("files/test/pdf_library")
mk_dirs(docs_to_download[:2], base_dir)

source

extract_fname

 extract_fname (url:str)

Exported source

def extract_fname(url: str) -> str:
    "Return the last path segment of `url`, ignoring query string and fragment"
    # The segment is returned as-is: percent-escapes such as %20 are not decoded.
    return Path(urlparse(url).path).name

source

download_doc

 download_doc (doc_info:tuple[str,str], base_dir:pathlib.Path,
               overwrite:bool=True)

Download a document from a given URL and save it to a specified directory.

	Type	Default	Details
doc_info	tuple		(eval_id, url) for the document to download
base_dir	Path		Base directory to save files to
overwrite	bool	True	If True, overwrite existing file
Returns	str

Exported source

def download_doc(
    doc_info: tuple[str, str],  # (eval_id, url) for the document to download
    base_dir: Path, # Base directory to save files to
    overwrite: bool = True # If True, overwrite existing file
) -> str: # Status message starting with "Downloaded", "Skipped" or "Failed to download"
    "Download a document from a given URL and save it to a specified directory."
    eval_id, url = doc_info

    fname = extract_fname(url)
    if not fname:
        # Without a file name the target path would be the evaluation directory
        # itself, which would be misreported as "already exists" or fail on write.
        return f"Failed to download {url}: no file name in URL"
    path = Path(base_dir) / eval_id / fname

    if path.exists() and not overwrite:
        return f"Skipped {fname} (already exists)"

    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()

        # The evaluation directory may not exist yet when this function is
        # called on its own, so create it before writing.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(r.content)

        return f"Downloaded {fname}"
    except Exception as e:
        # Deliberately broad: a failure is reported in the returned message
        # instead of being raised to the caller.
        return f"Failed to download {fname}: {e}"

# Rebuild the full (eval_id, url) list and show its first two entries.
docs_to_download = flatten_evals_id_doc(load_evals(fname))
print(docs_to_download[:2])

[('1a57974ab89d7280988aa6b706147ce1', 
'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_NG20P0516_MAY_2023_F
INAL_Abderrahim%20EL%20MOULAT.pdf'), ('1a57974ab89d7280988aa6b706147ce1', 
'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/RR0163_Evaluation%20Brief_MAY_%202023_Abde
rrahim%20EL%20MOULAT.pdf')]

# Download a single document; with overwrite=False a file that is already on disk is skipped.
eval_id, url = docs_to_download[0]
download_doc((eval_id, url), base_dir=base_dir, overwrite=False)

'Downloaded Internal%20Evaluation_NG20P0516_MAY_2023_FINAL_Abderrahim%20EL%20MOULAT.pdf'

source

download_docs

 download_docs (json_file:str, base_dir:pathlib.Path=Path('PDF_Library'),
                n_workers:int=5, overwrite:bool=False,
                cfg:dict={'id_field': 'id', 'docs_field': 'docs',
                'url_field': 'File URL'})

Download all evaluation documents from a JSON file in parallel

	Type	Default	Details
json_file	str		path to the JSON file
base_dir	Path	PDF_Library	path to the base directory
n_workers	int	5	number of workers
overwrite	bool	False	if True, overwrite existing files
cfg	dict	{‘id_field’: ‘id’, ‘docs_field’: ‘docs’, ‘url_field’: ‘File URL’}	config for the JSON file
Returns	list

Exported source

def download_docs(
    json_file: str, # path to the JSON file
    base_dir: Path = Path("./PDF_Library"), # path to the base directory
    n_workers: int = 5, # number of workers
    overwrite: bool = False, # if True, overwrite existing files
    cfg: dict = default_config # config for the JSON file
) -> list: # one status message per document, as returned by `download_doc`
    "Download all evaluation documents from a JSON file in parallel"
    # Accept a plain string as well, since `mk_dirs` joins paths with `/`.
    base_dir = Path(base_dir)
    # Forward `cfg` so that custom field names are actually used when flattening;
    # previously the argument was accepted but ignored.
    docs_to_download = flatten_evals_id_doc(load_evals(json_file), cfg=cfg)
    # Create every evaluation directory up front, before downloads start.
    mk_dirs(docs_to_download, base_dir)
    download_func = partial(download_doc, base_dir=base_dir, overwrite=overwrite)
    results = parallel(download_func,
                      docs_to_download,
                      n_workers=n_workers,
                      total=len(docs_to_download),
                      progress=True)

    return results

# Download every document of the sample file, replacing files that already exist.
# NOTE(review): n_workers=0 presumably makes fastcore's `parallel` run serially in-process — confirm.
fname = 'files/test/evaluations.json'
base_dir = Path("files/test/pdf_library")
download_docs(fname, base_dir=base_dir, n_workers=0, overwrite=True)

(#24) ['Downloaded Internal%20Evaluation_NG20P0516_MAY_2023_FINAL_Abderrahim%20EL%20MOULAT.pdf','Downloaded RR0163_Evaluation%20Brief_MAY_%202023_Abderrahim%20EL%20MOULAT.pdf','Downloaded IB0238_Evaluation%20Brief_FEB_%202023_Abderrahim%20EL%20MOULAT.pdf','Downloaded Internal%20Evaluation_IB0238__FEB_2023_FINAL%20RE_Abderrahim%20EL%20MOULAT.pdf','Downloaded IB0053_Evaluation%20Brief_SEP_%202022_Abderrahim%20EL%20MOULAT.pdf','Downloaded Internal%20Evaluation_IB0053_OCT_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf','Downloaded Internal%20Evaluation_NC0030_JUNE_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf','Downloaded NC0030_Evaluation%20Brief_June%202022_Abderrahim%20EL%20MOULAT.pdf','Downloaded CD0015_Evaluation%20Brief_May%202022_Abderrahim%20EL%20MOULAT.pdf','Downloaded Projet%20CD0015_Final%20Evaluation%20Report_May_202_Abderrahim%20EL%20MOULAT.pdf','Downloaded Internal%20Evaluation_Retour%20Vert_JUL_2021_Fina_Abderrahim%20EL%20MOULAT.pdf','Downloaded NC0012_Evaluation%20Brief_JUL%202021_Abderrahim%20EL%20MOULAT.pdf','Downloaded Nigeria%20GIZ%20Internal%20Evaluation_JANUARY_2021__Abderrahim%20EL%20MOULAT.pdf','Downloaded Nigeria%20GIZ%20Project_Evaluation%20Brief_JAN%202021_Abderrahim%20EL%20MOULAT_0.pdf','Downloaded Evaluation%20Brief_ARCO_Shiraz%20JERBI.pdF','Downloaded Final%20evaluation%20report_ARCO_Shiraz%20JERBI_1.pdf','Downloaded Management%20Response%20Matrix_ARCO_Shiraz%20JERBI.pdf','Downloaded IOM%20MANAGEMENT%20RESPONSE%20MATRIX.pdf','Downloaded IOM%20Niger%20-%20MIRAA%20III%20-%20Final%20Evaluation%20Report%20%28003%29.pdf','Downloaded CE.0369%20-%20IDEE%20-%20ANNEXE%201%20-%20Rapport%20Recherche_Joanie%20DUROCHER_0.pdf'...]