"""Helper script to go from PDF to PNG ZIP Files we can use in HTML on the LB."""
from __future__ import annotations
import zipfile
from pathlib import Path
from multiprocessing import Pool, cpu_count
from pdf2image import convert_from_path
ROOT_DIR = Path("./data")
DPI = 800 # you can lower this if files are huge / too slow
def process_pdf(pdf_path_str: str) -> None:
    """Convert one PDF into PNG page images, zip them, and delete the originals.

    Writes ``<stem>.png.zip`` next to the source PDF, then removes both the
    intermediate PNGs and the source PDF itself.

    Args:
        pdf_path_str: Path to the PDF file (string so it pickles cleanly
            across the multiprocessing boundary).
    """
    pdf_path = Path(pdf_path_str).resolve()
    zip_path = pdf_path.with_suffix(".png.zip")
    print(f"Converting {pdf_path}...")

    # Convert all pages of the PDF to PIL images (requires poppler).
    images = convert_from_path(str(pdf_path), dpi=DPI)

    # Save every page. BUG FIX: the previous code saved only images[0],
    # silently dropping all pages after the first in multi-page PDFs.
    # Single-page PDFs keep the original "<stem>.png" name for compatibility.
    png_paths: list[Path] = []
    for page_no, image in enumerate(images, start=1):
        if len(images) == 1:
            png_path = pdf_path.with_suffix(".png")
        else:
            png_path = pdf_path.with_suffix(f".page{page_no}.png")
        image.save(png_path, "PNG")
        png_paths.append(png_path)

    # Zip all PNGs into one archive (flat layout: archive names only).
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        for png_path in png_paths:
            zipf.write(png_path, arcname=png_path.name)

    # Clean up intermediate PNGs and the original PDF; missing_ok guards
    # against a concurrent/partial cleanup having already removed them.
    for png_path in png_paths:
        png_path.unlink(missing_ok=True)
    pdf_path.unlink(missing_ok=True)
def main() -> None:
    """Discover PDFs under ROOT_DIR and convert them in parallel."""
    targets = [str(path) for path in ROOT_DIR.rglob("*.pdf")]
    if not targets:
        print("No PDFs found.")
        return

    # Never spawn more workers than there are PDFs to process.
    worker_count = min(cpu_count(), len(targets))
    print(f"Found {len(targets)} PDFs. Using {worker_count} processes.")

    with Pool(processes=worker_count) as pool:
        # imap_unordered streams completions as they happen; we only need
        # the side effects, so drain the iterator and discard results.
        for _ in pool.imap_unordered(process_pdf, targets):
            pass
if __name__ == "__main__":
main() |