cambridge-core-merge/cambridge_core_merge.py
2025-04-18 02:26:25 -05:00

85 lines
2.7 KiB
Python
Executable file

#!/usr/bin/env python
# SPDX-FileCopyrightText: 2025 Sayantan Santra <sayantan.santra689@gmail.com>
# SPDX-License-Identifier: GPL-3.0-only
from pypdf import PdfWriter, PdfReader
from zipfile import ZipFile
from argparse import ArgumentParser
from roman import fromRoman
from PIL import Image
from tempfile import TemporaryFile
def main():
parser = ArgumentParser(
prog="cambridge-core-merge",
description="A python script to merge books downloaded from Cambridge Core into a single PDF file.",
epilog="Copyright (C) 2025 Sayantan Santra",
)
parser.add_argument(
"-z",
"--zipfile",
required=True,
help="The path of the zip file obtained from Cambridge Core.",
)
parser.add_argument("-n", "--name", required=True, help="The path of the final PDF file.")
parser.add_argument("-c", "--cover", help="The path of the cover file. (Must be JPG.)")
args = parser.parse_args()
zip = ZipFile(args.zipfile, "r")
filelist = zip.namelist()
filelist.sort()
merger = PdfWriter()
first_bookmark = True
for file in filelist:
print("Adding " + file)
parts = file.split("_", 4)
name = parts[4][:-4].replace("_", " ")
start_index = merger.get_num_pages()
with zip.open(file) as pdf:
merger.append(pdf)
if first_bookmark and args.cover is not None:
cover = Image.open(args.cover, "r")
h = merger.pages[0].mediabox.height
w = merger.pages[0].mediabox.width
# Note: UserUnit is 1/72 inch by default and our target dpi is 300
h = int(h * 300 / 72)
w = int(w * 300 / 72)
cover = cover.resize([w, h])
temp = TemporaryFile()
cover.save(temp, "PDF", dpi=[300, 300])
merger.insert_page(PdfReader(temp).get_page(0), 0)
merger.add_outline_item("Cover", 0)
start_index += 1
merger.set_page_label(0, 0, "/a", prefix="cove", start=18)
first_bookmark = False
temp.close()
merger.add_outline_item(name, start_index)
startpage_str = parts[2]
endpage_str = parts[3]
pagestyle = "/D"
startpage, endpage = None, None
try:
startpage = int(startpage_str)
endpage = int(endpage_str)
except ValueError:
pagestyle = "/r"
startpage = fromRoman(startpage_str)
endpage = fromRoman(endpage_str)
merger.set_page_label(
start_index,
start_index + endpage - startpage,
style=pagestyle,
start=startpage,
)
merger.write(args.name + ".pdf")
merger.close()
if __name__ == "__main__":
main()