Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Read, create, merge, split, watermark, encrypt, OCR, and fill PDF files using Python and CLI tools
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
reference.md
# PDF Processing Advanced Reference

This document contains advanced PDF processing features, detailed examples, and additional libraries not covered in the main skill instructions.

## pypdfium2 Library (Apache/BSD License)

### Overview
pypdfium2 is a Python binding for PDFium (Chromium's PDF library). It excels at fast PDF rendering and image generation, and can serve as a PyMuPDF replacement.

### Render PDF to Images
```python
import pypdfium2 as pdfium
from PIL import Image

# Load PDF
pdf = pdfium.PdfDocument("document.pdf")

# Render page to image
page = pdf[0]  # First page
bitmap = page.render(
    scale=2.0,  # Higher resolution
    rotation=0  # No rotation
)

# Convert to PIL Image
img = bitmap.to_pil()
img.save("page_1.png", "PNG")

# Process multiple pages
for i, page in enumerate(pdf):
    bitmap = page.render(scale=1.5)
    img = bitmap.to_pil()
    img.save(f"page_{i+1}.jpg", "JPEG", quality=90)
```

### Extract Text with pypdfium2
```python
import pypdfium2 as pdfium

pdf = pdfium.PdfDocument("document.pdf")
for i, page in enumerate(pdf):
    # Text is exposed through a text page object, not directly on the page
    textpage = page.get_textpage()
    text = textpage.get_text_range()
    print(f"Page {i+1} text length: {len(text)} chars")
```

## JavaScript Libraries

### pdf-lib (MIT License)

pdf-lib is a powerful JavaScript library for creating and modifying PDF documents in any JavaScript environment.

#### Load and Manipulate Existing PDF
```javascript
import { PDFDocument } from 'pdf-lib';
import fs from 'fs';

async function manipulatePDF() {
    // Load existing PDF
    const existingPdfBytes = fs.readFileSync('input.pdf');
    const pdfDoc = await PDFDocument.load(existingPdfBytes);

    // Get page count
    const pageCount = pdfDoc.getPageCount();
    console.log(`Document has ${pageCount} pages`);

    // Add new page
    const newPage = pdfDoc.addPage([600, 400]);
    newPage.drawText('Added by pdf-lib', {
        x: 100,
        y: 300,
        size: 16
    });

    // Save modified PDF
    const pdfBytes = await pdfDoc.save();
    fs.writeFileSync('modified.pdf', pdfBytes);
}
```

#### Create Complex PDFs from Scratch
```javascript
import { PDFDocument, rgb, StandardFonts } from 'pdf-lib';
import fs from 'fs';

async function createPDF() {
    const pdfDoc = await PDFDocument.create();

    // Add fonts
    const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica);
    const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold);

    // Add page
    const page = pdfDoc.addPage([595, 842]); // A4 size
    const { width, height } = page.getSize();

    // Add text with styling
    page.drawText('Invoice #12345', {
        x: 50,
        y: height - 50,
        size: 18,
        font: helveticaBold,
        color: rgb(0.2, 0.2, 0.8)
    });

    // Add rectangle (header background)
    page.drawRectangle({
        x: 40,
        y: height - 100,
        width: width - 80,
        height: 30,
        color: rgb(0.9, 0.9, 0.9)
    });

    // Add table-like content
    const items = [
        ['Item', 'Qty', 'Price', 'Total'],
        ['Widget', '2', '$50', '$100'],
        ['Gadget', '1', '$75', '$75']
    ];

    let yPos = height - 150;
    items.forEach(row => {
        let xPos = 50;
        row.forEach(cell => {
            page.drawText(cell, {
                x: xPos,
                y: yPos,
                size: 12,
                font: helveticaFont
            });
            xPos += 120;
        });
        yPos -= 25;
    });

    const pdfBytes = await pdfDoc.save();
    fs.writeFileSync('created.pdf', pdfBytes);
}
```

#### Advanced Merge and Split Operations
```javascript
import { PDFDocument } from 'pdf-lib';
import fs from 'fs';

async function mergePDFs() {
    // Create new document
    const mergedPdf = await PDFDocument.create();

    // Load source PDFs
    const pdf1Bytes = fs.readFileSync('doc1.pdf');
    const pdf2Bytes = fs.readFileSync('doc2.pdf');

    const pdf1 = await PDFDocument.load(pdf1Bytes);
    const pdf2 = await PDFDocument.load(pdf2Bytes);

    // Copy pages from first PDF
    const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices());
    pdf1Pages.forEach(page => mergedPdf.addPage(page));

    // Copy specific pages from second PDF (pages 0, 2, 4)
    const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]);
    pdf2Pages.forEach(page => mergedPdf.addPage(page));

    const mergedPdfBytes = await mergedPdf.save();
    fs.writeFileSync('merged.pdf', mergedPdfBytes);
}
```

### pdfjs-dist (Apache License)

PDF.js is Mozilla's JavaScript library for rendering PDFs in the browser.

#### Basic PDF Loading and Rendering
```javascript
import * as pdfjsLib from 'pdfjs-dist';

// Configure worker (important for performance)
pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js';

async function renderPDF() {
    // Load PDF
    const loadingTask = pdfjsLib.getDocument('document.pdf');
    const pdf = await loadingTask.promise;

    console.log(`Loaded PDF with ${pdf.numPages} pages`);

    // Get first page
    const page = await pdf.getPage(1);
    const viewport = page.getViewport({ scale: 1.5 });

    // Render to canvas
    const canvas = document.createElement('canvas');
    const context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    const renderContext = {
        canvasContext: context,
        viewport: viewport
    };

    await page.render(renderContext).promise;
    document.body.appendChild(canvas);
}
```

#### Extract Text with Coordinates
```javascript
import * as pdfjsLib from 'pdfjs-dist';

async function extractText() {
    const loadingTask = pdfjsLib.getDocument('document.pdf');
    const pdf = await loadingTask.promise;

    let fullText = '';

    // Extract text from all pages
    for (let i = 1; i <= pdf.numPages; i++) {
        const page = await pdf.getPage(i);
        const textContent = await page.getTextContent();

        const pageText = textContent.items
            .map(item => item.str)
            .join(' ');

        fullText += `\n--- Page ${i} ---\n${pageText}`;

        // Get text with coordinates for advanced processing
        const textWithCoords = textContent.items.map(item => ({
            text: item.str,
            x: item.transform[4],
            y: item.transform[5],
            width: item.width,
            height: item.height
        }));
    }

    console.log(fullText);
    return fullText;
}
```

#### Extract Annotations and Forms
```javascript
import * as pdfjsLib from 'pdfjs-dist';

async function extractAnnotations() {
    const loadingTask = pdfjsLib.getDocument('annotated.pdf');
    const pdf = await loadingTask.promise;

    for (let i = 1; i <= pdf.numPages; i++) {
        const page = await pdf.getPage(i);
        const annotations = await page.getAnnotations();

        annotations.forEach(annotation => {
            console.log(`Annotation type: ${annotation.subtype}`);
            console.log(`Content: ${annotation.contents}`);
            console.log(`Coordinates: ${JSON.stringify(annotation.rect)}`);
        });
    }
}
```

## Advanced Command-Line Operations

### poppler-utils Advanced Features

#### Extract Text with Bounding Box Coordinates
```bash
# Extract text with bounding box coordinates (essential for structured data)
pdftotext -bbox-layout document.pdf output.xml

# The XML output contains precise coordinates for each text element
```

#### Advanced Image Conversion
```bash
# Convert to PNG images with specific resolution
pdftoppm -png -r 300 document.pdf output_prefix

# Convert specific page range with high resolution
pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages

# Convert to JPEG with quality setting
pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf jpeg_output
```

#### Extract Embedded Images
```bash
# Extract all embedded images with metadata
pdfimages -j -p document.pdf page_images

# List image info without extracting
pdfimages -list document.pdf

# Extract images in their original format
pdfimages -all document.pdf images/img
```

### qpdf Advanced Features

#### Complex Page Manipulation
```bash
# Split PDF into groups of pages (%d is replaced with the zero-padded page range)
qpdf --split-pages=3 input.pdf output_group_%d.pdf

# Extract specific pages with complex ranges
qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf

# Merge specific pages from multiple PDFs
qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf
```

#### PDF Optimization and Repair
```bash
# Optimize PDF for web (linearize for streaming)
qpdf --linearize input.pdf optimized.pdf

# Remove unused objects and compress
qpdf --object-streams=generate --compress-streams=y --recompress-flate --compression-level=9 input.pdf compressed.pdf

# Attempt to repair corrupted PDF structure
# (qpdf reconstructs damaged cross-reference data when it rewrites the file)
qpdf --check input.pdf
qpdf damaged.pdf repaired.pdf

# Show detailed PDF structure for debugging
qpdf --show-pages input.pdf > structure.txt
```

#### Advanced Encryption
```bash
# Add password protection with specific permissions
qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf

# Check encryption status
qpdf --show-encryption encrypted.pdf

# Remove password protection (requires password)
qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf
```

## Advanced Python Techniques

### pdfplumber Advanced Features

#### Extract Text with Precise Coordinates
```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Extract all text with coordinates
    chars = page.chars
    for char in chars[:10]:  # First 10 characters
        print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}")

    # Extract text by bounding box (left, top, right, bottom)
    bbox_text = page.within_bbox((100, 100, 400, 200)).extract_text()
```

#### Advanced Table Extraction with Custom Settings
```python
import pdfplumber
import pandas as pd

with pdfplumber.open("complex_table.pdf") as pdf:
    page = pdf.pages[0]

    # Extract tables with custom settings for complex layouts
    table_settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "snap_tolerance": 3,
        "intersection_tolerance": 15
    }
    tables = page.extract_tables(table_settings)

    # Visual debugging for table extraction
    img = page.to_image(resolution=150)
    img.save("debug_layout.png")
```

### reportlab Advanced Features

#### Create Professional Reports with Tables
```python
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

# Sample data
data = [
    ['Product', 'Q1', 'Q2', 'Q3', 'Q4'],
    ['Widgets', '120', '135', '142', '158'],
    ['Gadgets', '85', '92', '98', '105']
]

# Create PDF with table
doc = SimpleDocTemplate("report.pdf")
elements = []

# Add title
styles = getSampleStyleSheet()
title = Paragraph("Quarterly Sales Report", styles['Title'])
elements.append(title)

# Add table with advanced styling
table = Table(data)
table.setStyle(TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('FONTSIZE', (0, 0), (-1, 0), 14),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
    ('GRID', (0, 0), (-1, -1), 1, colors.black)
]))
elements.append(table)

doc.build(elements)
```

## Complex Workflows

### Extract Figures/Images from PDF

#### Method 1: Using pdfimages (fastest)
```bash
# Extract all images with original quality
pdfimages -all document.pdf images/img
```

#### Method 2: Using pypdfium2 + Image Processing
```python
import pypdfium2 as pdfium
from PIL import Image
import numpy as np

def extract_figures(pdf_path, output_dir):
    pdf = pdfium.PdfDocument(pdf_path)

    for page_num, page in enumerate(pdf):
        # Render high-resolution page
        bitmap = page.render(scale=3.0)
        img = bitmap.to_pil()

        # Convert to numpy for processing
        img_array = np.array(img)

        # Simple figure detection (non-white regions)
        mask = np.any(img_array != [255, 255, 255], axis=2)

        # Find contours and extract bounding boxes
        # (This is simplified - real implementation would need more sophisticated detection)

        # Save detected figures
        # ... implementation depends on specific needs
```

### Batch PDF Processing with Error Handling
```python
import os
import glob
from pypdf import PdfReader, PdfWriter
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def batch_process_pdfs(input_dir, operation='merge'):
    pdf_files = glob.glob(os.path.join(input_dir, "*.pdf"))

    if operation == 'merge':
        writer = PdfWriter()
        for pdf_file in pdf_files:
            try:
                reader = PdfReader(pdf_file)
                for page in reader.pages:
                    writer.add_page(page)
                logger.info(f"Processed: {pdf_file}")
            except Exception as e:
                logger.error(f"Failed to process {pdf_file}: {e}")
                continue

        with open("batch_merged.pdf", "wb") as output:
            writer.write(output)

    elif operation == 'extract_text':
        for pdf_file in pdf_files:
            try:
                reader = PdfReader(pdf_file)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()

                output_file = pdf_file.replace('.pdf', '.txt')
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(text)
                logger.info(f"Extracted text from: {pdf_file}")

            except Exception as e:
                logger.error(f"Failed to extract text from {pdf_file}: {e}")
                continue
```

### Advanced PDF Cropping
```python
from pypdf import PdfWriter, PdfReader

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Crop page (left, bottom, right, top in points)
page = reader.pages[0]
page.mediabox.left = 50
page.mediabox.bottom = 50
page.mediabox.right = 550
page.mediabox.top = 750

writer.add_page(page)
with open("cropped.pdf", "wb") as output:
    writer.write(output)
```

## Performance Optimization Tips

### 1. For Large PDFs
- Use streaming approaches instead of loading entire PDF in memory
- Use `qpdf --split-pages` for splitting large files
- Process pages individually with pypdfium2

### 2. For Text Extraction
- `pdftotext` (optionally with `-layout`) is fastest for plain text extraction; use `pdftotext -bbox-layout` when you also need coordinates
- Use pdfplumber for structured data and tables
- Avoid pypdf's `page.extract_text()` for very large documents

### 3. For Image Extraction
- `pdfimages` is much faster than rendering pages
- Use low resolution for previews, high resolution for final output

### 4. For Form Filling
- pdf-lib maintains form structure better than most alternatives
- Pre-validate form fields before processing

### 5. Memory Management
```python
# Process PDFs in chunks
def process_large_pdf(pdf_path, chunk_size=10):
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    for start_idx in range(0, total_pages, chunk_size):
        end_idx = min(start_idx + chunk_size, total_pages)
        writer = PdfWriter()

        for i in range(start_idx, end_idx):
            writer.add_page(reader.pages[i])

        # Process chunk
        with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output:
            writer.write(output)
```

## Troubleshooting Common Issues

### Encrypted PDFs
```python
# Handle password-protected PDFs
from pypdf import PdfReader

try:
    reader = PdfReader("encrypted.pdf")
    if reader.is_encrypted:
        reader.decrypt("password")
except Exception as e:
    print(f"Failed to decrypt: {e}")
```

### Corrupted PDFs
```bash
# Use qpdf to repair
qpdf --check corrupted.pdf
qpdf --replace-input corrupted.pdf
```

### Text Extraction Issues
```python
# Fallback to OCR for scanned PDFs
import pytesseract
from pdf2image import convert_from_path

def extract_text_with_ocr(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for i, image in enumerate(images):
        text += pytesseract.image_to_string(image)
    return text
```

## License Information

- **pypdf**: BSD License
- **pdfplumber**: MIT License
- **pypdfium2**: Apache/BSD License
- **reportlab**: BSD License
- **poppler-utils**: GPL-2 License
- **qpdf**: Apache License
- **pdf-lib**: MIT License
- **pdfjs-dist**: Apache License