source
flatten_evals_id_doc
flatten_evals_id_doc (evals:fastcore.foundation.L, cfg:dict={'id_field':
'id', 'docs_field': 'docs', 'url_field': 'File
URL'})
Flatten evaluation records into list of (id, url) tuples
evals
L
List of evaluation records
cfg
dict
{‘id_field’: ‘id’, ‘docs_field’: ‘docs’, ‘url_field’: ‘File URL’}
Config dict with field names
Returns
L
Returns list of (id, url) tuples
Exported source
def flatten_evals_id_doc(
    evals: L, # Evaluation records, each carrying an ID and a list of documents
    cfg: dict = default_config # Field names used to read each record
    ) -> L: # One (id, url) tuple per document
    "Pair every document URL with the ID of the evaluation it belongs to"
    pairs = []
    for record in evals:
        # An evaluation may reference several documents; emit one pair per document.
        for doc in record[cfg['docs_field']]:
            pairs.append((record[cfg['id_field']], doc[cfg['url_field']]))
    return L(pairs)
# Flatten the sample evaluations file and display the resulting (id, url) pairs.
fname = 'files/test/evaluations.json'
docs_to_download = flatten_evals_id_doc(load_evals(fname))
docs_to_download
(#24) [('1a57974ab89d7280988aa6b706147ce1', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_NG20P0516_MAY_2023_FINAL_Abderrahim%20EL%20MOULAT.pdf'),('1a57974ab89d7280988aa6b706147ce1', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/RR0163_Evaluation%20Brief_MAY_%202023_Abderrahim%20EL%20MOULAT.pdf'),('c660e774d14854e20dc74457712b50ec', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IB0238_Evaluation%20Brief_FEB_%202023_Abderrahim%20EL%20MOULAT.pdf'),('c660e774d14854e20dc74457712b50ec', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_IB0238__FEB_2023_FINAL%20RE_Abderrahim%20EL%20MOULAT.pdf'),('2cae361c6779b561af07200e3d4e4051', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IB0053_Evaluation%20Brief_SEP_%202022_Abderrahim%20EL%20MOULAT.pdf'),('2cae361c6779b561af07200e3d4e4051', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_IB0053_OCT_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf'),('a9dea21fd254df7759b3936903e0a885', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_NC0030_JUNE_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf'),('a9dea21fd254df7759b3936903e0a885', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/NC0030_Evaluation%20Brief_June%202022_Abderrahim%20EL%20MOULAT.pdf'),('f0b09b92ea8ad6dddd9623de68a8d278', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/CD0015_Evaluation%20Brief_May%202022_Abderrahim%20EL%20MOULAT.pdf'),('f0b09b92ea8ad6dddd9623de68a8d278', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Projet%20CD0015_Final%20Evaluation%20Report_May_202_Abderrahim%20EL%20MOULAT.pdf'),('0456b0faaea16715afdb96969c337bc1', 
'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_Retour%20Vert_JUL_2021_Fina_Abderrahim%20EL%20MOULAT.pdf'),('0456b0faaea16715afdb96969c337bc1', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/NC0012_Evaluation%20Brief_JUL%202021_Abderrahim%20EL%20MOULAT.pdf'),('d5d71db805eeae249d7a2cf381be05cb', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Nigeria%20GIZ%20Internal%20Evaluation_JANUARY_2021__Abderrahim%20EL%20MOULAT.pdf'),('d5d71db805eeae249d7a2cf381be05cb', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Nigeria%20GIZ%20Project_Evaluation%20Brief_JAN%202021_Abderrahim%20EL%20MOULAT_0.pdf'),('f365264e3f69efb4c61b0cecb374e0f4', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Evaluation%20Brief_ARCO_Shiraz%20JERBI.pdF'),('f365264e3f69efb4c61b0cecb374e0f4', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Final%20evaluation%20report_ARCO_Shiraz%20JERBI_1.pdf'),('f365264e3f69efb4c61b0cecb374e0f4', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Management%20Response%20Matrix_ARCO_Shiraz%20JERBI.pdf'),('93e51fcbedaaddd6a7e38fd8ee039614', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IOM%20MANAGEMENT%20RESPONSE%20MATRIX.pdf'),('93e51fcbedaaddd6a7e38fd8ee039614', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/IOM%20Niger%20-%20MIRAA%20III%20-%20Final%20Evaluation%20Report%20%28003%29.pdf'),('5efee25fe816456e0af729cb91896a38', 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/CE.0369%20-%20IDEE%20-%20ANNEXE%201%20-%20Rapport%20Recherche_Joanie%20DUROCHER_0.pdf')...]
source
mk_dirs
mk_dirs (docs_to_download:fastcore.foundation.L, base_dir:pathlib.Path)
Create directories for each unique evaluation ID
docs_to_download
L
list of (eval_id, url) tuples
base_dir
Path
path to the base directory
Returns
None
Exported source
def mk_dirs(
    docs_to_download: L, # (eval_id, url) tuples describing the documents
    base_dir: Path # Directory under which the per-evaluation folders are created
    ) -> None:
    "Ensure `base_dir` contains one folder per distinct evaluation ID"
    # Several documents can share an evaluation ID, so collapse duplicates first.
    for eval_id in {eval_id for eval_id, _ in docs_to_download}:
        target = base_dir / eval_id
        target.mkdir(parents=True, exist_ok=True)
# Create the folders needed for the first two documents only.
base_dir = Path("files/test/pdf_library")
mk_dirs(docs_to_download[:2], base_dir=base_dir)
source
download_doc
download_doc (doc_info:tuple[str,str], base_dir:pathlib.Path,
overwrite:bool=True)
Download a document from a given URL and save it to a specified directory.
doc_info
tuple
(eval_id, url) for the document to download
base_dir
Path
Base directory to save files to
overwrite
bool
True
If True, overwrite existing file
Returns
str
Exported source
def download_doc(
    doc_info: tuple[str, str], # (eval_id, url) for the document to download
    base_dir: Path, # Base directory to save files to
    overwrite: bool = True # If True, overwrite existing file
    ) -> str: # Status message: "Downloaded ...", "Skipped ..." or "Failed to download ..."
    "Download a document from a given URL and save it under `base_dir/eval_id/`."
    eval_id, url = doc_info
    fname = extract_fname(url)
    path = base_dir / eval_id / fname
    if path.exists() and not overwrite:
        return f"Skipped {fname} (already exists)"
    # Failures are reported as a status string rather than raised, so that one
    # bad URL does not abort a batch of downloads.
    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        # Create the evaluation folder on demand so the function also works
        # when the directory was not prepared beforehand.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(r.content)
        return f"Downloaded {fname}"
    except Exception as e:
        return f"Failed to download {fname}: {e}"
# Rebuild the full (id, url) list and peek at its first two entries.
docs_to_download = flatten_evals_id_doc(load_evals(fname))
print(docs_to_download[:2])
[( '1a57974ab89d7280988aa6b706147ce1' ,
'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Internal%20Evaluation_NG20P0516_MAY_2023_F
INAL_Abderrahim%20EL%20MOULAT.pdf' ) , ( '1a57974ab89d7280988aa6b706147ce1' ,
'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/RR0163_Evaluation%20Brief_MAY_%202023_Abde
rrahim%20EL%20MOULAT.pdf' )]
# Download the first document, keeping any copy that is already on disk.
eval_id, url = docs_to_download[0]
download_doc((eval_id, url), base_dir=base_dir, overwrite=False)
'Downloaded Internal%20Evaluation_NG20P0516_MAY_2023_FINAL_Abderrahim%20EL%20MOULAT.pdf'
source
download_docs
download_docs (json_file:str, base_dir:pathlib.Path=Path('PDF_Library'),
n_workers:int=5, overwrite:bool=False,
cfg:dict={'id_field': 'id', 'docs_field': 'docs',
'url_field': 'File URL'})
Download all evaluation documents from a JSON file in parallel
json_file
str
path to the JSON file
base_dir
Path
PDF_Library
path to the base directory
n_workers
int
5
number of workers
overwrite
bool
False
if True, overwrite existing files
cfg
dict
{‘id_field’: ‘id’, ‘docs_field’: ‘docs’, ‘url_field’: ‘File URL’}
config for the JSON file
Returns
list
Exported source
def download_docs(
    json_file: str, # path to the JSON file
    base_dir: Path = Path("./PDF_Library"), # path to the base directory
    n_workers: int = 5, # number of workers
    overwrite: bool = False, # if True, overwrite existing files
    cfg: dict = default_config # config for the JSON file
    ) -> list: # one status message per document
    "Download all evaluation documents from a JSON file in parallel"
    # Forward `cfg` so that custom field names are honoured when reading the records.
    docs_to_download = flatten_evals_id_doc(load_evals(json_file), cfg=cfg)
    mk_dirs(docs_to_download, base_dir)
    download_func = partial(download_doc, base_dir=base_dir, overwrite=overwrite)
    results = parallel(download_func,
                       docs_to_download,
                       n_workers=n_workers,
                       total=len(docs_to_download),
                       progress=True)
    return results
# Download every document of the sample file, replacing existing files
# (n_workers=0 is passed here, as in the original example).
fname = 'files/test/evaluations.json'
base_dir = Path("files/test/pdf_library")
download_docs(fname, base_dir=base_dir, overwrite=True, n_workers=0)
(#24) ['Downloaded Internal%20Evaluation_NG20P0516_MAY_2023_FINAL_Abderrahim%20EL%20MOULAT.pdf','Downloaded RR0163_Evaluation%20Brief_MAY_%202023_Abderrahim%20EL%20MOULAT.pdf','Downloaded IB0238_Evaluation%20Brief_FEB_%202023_Abderrahim%20EL%20MOULAT.pdf','Downloaded Internal%20Evaluation_IB0238__FEB_2023_FINAL%20RE_Abderrahim%20EL%20MOULAT.pdf','Downloaded IB0053_Evaluation%20Brief_SEP_%202022_Abderrahim%20EL%20MOULAT.pdf','Downloaded Internal%20Evaluation_IB0053_OCT_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf','Downloaded Internal%20Evaluation_NC0030_JUNE_2022_FINAL_Abderrahim%20EL%20MOULAT_0.pdf','Downloaded NC0030_Evaluation%20Brief_June%202022_Abderrahim%20EL%20MOULAT.pdf','Downloaded CD0015_Evaluation%20Brief_May%202022_Abderrahim%20EL%20MOULAT.pdf','Downloaded Projet%20CD0015_Final%20Evaluation%20Report_May_202_Abderrahim%20EL%20MOULAT.pdf','Downloaded Internal%20Evaluation_Retour%20Vert_JUL_2021_Fina_Abderrahim%20EL%20MOULAT.pdf','Downloaded NC0012_Evaluation%20Brief_JUL%202021_Abderrahim%20EL%20MOULAT.pdf','Downloaded Nigeria%20GIZ%20Internal%20Evaluation_JANUARY_2021__Abderrahim%20EL%20MOULAT.pdf','Downloaded Nigeria%20GIZ%20Project_Evaluation%20Brief_JAN%202021_Abderrahim%20EL%20MOULAT_0.pdf','Downloaded Evaluation%20Brief_ARCO_Shiraz%20JERBI.pdF','Downloaded Final%20evaluation%20report_ARCO_Shiraz%20JERBI_1.pdf','Downloaded Management%20Response%20Matrix_ARCO_Shiraz%20JERBI.pdf','Downloaded IOM%20MANAGEMENT%20RESPONSE%20MATRIX.pdf','Downloaded IOM%20Niger%20-%20MIRAA%20III%20-%20Final%20Evaluation%20Report%20%28003%29.pdf','Downloaded CE.0369%20-%20IDEE%20-%20ANNEXE%201%20-%20Rapport%20Recherche_Joanie%20DUROCHER_0.pdf'...]