Version 2.3.2 (Complete refactor)

parent dcb57e5a2b
commit 9963a25e4a

.dockerignore: 10 changed lines (Normal file)
@@ -0,0 +1,10 @@
.gitignore
.dockerignore
.git
.archive
.env
.env.*
logs
Dockerfile
README.md
docker-compose.yaml

Dockerfile: 80 changed lines
@@ -1,14 +1,72 @@
FROM archlinux:base

# Update and install necessary packages
RUN pacman -Sy --noconfirm curl reflector && \
    reflector --latest 5 --sort rate --save /etc/pacman.d/mirrorlist && \
    sed -i '/\[options\]/a XferCommand = /usr/bin/curl -C - --fail --retry 3 --retry-delay 3 -o %o %u' /etc/pacman.conf && \
    pacman -Syu --noconfirm --needed texlive-basic texlive-bibtexextra texlive-bin texlive-binextra texlive-context texlive-fontsrecommended texlive-fontsextra texlive-fontutils texlive-formatsextra texlive-langenglish texlive-langeuropean texlive-langfrench texlive-langgerman texlive-latex texlive-latexextra texlive-latexrecommended texlive-luatex texlive-mathscience texlive-metapost texlive-music texlive-pictures texlive-plaingeneric texlive-pstricks texlive-publishers && \
    pacman -Syu --noconfirm --needed python-fastapi uvicorn python-python-multipart && \
    yes | pacman -Scc
FROM python:3.11-slim AS builder

# Set working directory
WORKDIR /app
COPY main.py .

ENTRYPOINT ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
# Install build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip wheel --no-cache-dir --wheel-dir /app/wheels -r requirements.txt


# Create final image
FROM python:3.11-slim AS final

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app \
    PORT=8000 \
    MAX_WORKERS=4 \
    TZ=UTC \
    JOBS_DIR=/data/jobs \
    DB_PATH=/data/db/jobs.db

# Create a non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Install LaTeX and required dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    texlive-full \
    tini \
    sqlite3 \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Create persistent directories
RUN mkdir -p /data/jobs /data/db /app/temp && \
    chown -R appuser:appuser /app /data

# Copy Python wheels from builder stage
COPY --from=builder /app/wheels /wheels

# Install Python dependencies
RUN pip install --no-cache-dir /wheels/* && rm -rf /wheels

# Copy application code
COPY . /app/

# Create entrypoint script that properly handles environment variables
RUN echo '#!/bin/sh\n\
exec uvicorn main:app --host 0.0.0.0 --port $PORT --workers $MAX_WORKERS --log-level info\n\
' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh

# Switch to non-root user
USER appuser

# Expose the service port
EXPOSE 8000

# Use tini as init
ENTRYPOINT ["/usr/bin/tini", "--"]

# Run the application with proper signal handling
CMD ["/app/entrypoint.sh"]

README.md: 219 changed lines
@@ -1,58 +1,213 @@
# LaTeX-to-PDF Conversion Service
# LaTeX to PDF Conversion Service

This service provides an API endpoint to convert LaTeX documents into PDF format. It supports `.zip` file uploads containing the LaTeX source file (`main.tex`) and any associated files (e.g., images or additional `.tex` files).
A high-performance, secure REST API for converting LaTeX documents to PDF format.

## Getting Started
## Features

### Building the Docker Image
- **Simple API**: Upload a ZIP file containing LaTeX documents and get a PDF back
- **Secure Processing**: Comprehensive security measures including input validation and sanitization
- **Multiple Workers**: Designed for concurrency with shared file system and SQLite database
- **Robust Error Handling**: Detailed error messages with LaTeX compilation logs
- **Automatic Cleanup**: Background process removes expired PDFs and temporary files
- **Configurable Options**: Multiple compilation runs, BibTeX support, custom main file name
- **API Key Authentication**: Optional security layer with configurable API keys
- **Rate Limiting**: Protection against API abuse
- **Resource Control**: Limits on file sizes and compilation time
- **Docker Ready**: Ready-to-use Docker and Docker Compose configurations

To build the Docker image for the conversion service, navigate to the project directory and run:
## Quick Start

The easiest way to run the service is with Docker Compose:

```bash
docker build -t rbehzadan/tex2pdf .
# Clone the repository
git clone https://github.com/yourusername/tex2pdf.git
cd tex2pdf

# Start the service
docker-compose up -d
```

### Running the Service
The service will be available at `http://localhost:8000`.

After building the image, you can start the service with the following command:
## API Usage

### Convert LaTeX to PDF

```bash
docker run -d -p 8000:8000 rbehzadan/tex2pdf
curl -X POST \
  -H "X-API-Key: 1234" \
  -F "zip_file=@my_latex_files.zip" \
  http://localhost:8000/tex2pdf
```

This command runs the Docker container in the background (`-d`) and maps port `8000` of the container to port `8000` on the host, making the service accessible at `http://localhost:8000`.
Response:
```json
{
  "job_id": "28f5bf9b-587f-4f3c-a3de-4d737d9736ce",
  "status": "processing",
  "message": "Conversion job started"
}
```

## API Endpoint

The service exposes a single POST endpoint at `/tex2pdf` for converting LaTeX to PDF.

### Uploading a `.zip` File

The `.zip` file should contain a `main.tex` file and can include additional resources such as images or other `.tex` files used by `main.tex`.

## Manual Testing

### Testing with `curl`

To test the conversion service with `curl`, use the following command:
### Check Job Status

```bash
curl -X POST -F "zip_file=@path/to/your/file.zip" http://localhost:8000/tex2pdf -o output.pdf
curl -X GET \
  -H "X-API-Key: 1234" \
  http://localhost:8000/tex2pdf/status/28f5bf9b-587f-4f3c-a3de-4d737d9736ce
```

Replace `path/to/your/file.zip` with the actual path to your `.zip` file. The resulting PDF will be saved as `output.pdf` in the current directory.
Response:
```json
{
  "job_id": "28f5bf9b-587f-4f3c-a3de-4d737d9736ce",
  "status": "completed",
  "created_at": 1741424390.6039968
}
```

### Testing with HTTPie

HTTPie offers a more user-friendly way to test the service. Use the following command:
### Download PDF

```bash
http -f POST http://localhost:8000/tex2pdf zip_file@path/to/your/file.zip > output.pdf
curl -X GET \
  -H "X-API-Key: 1234" \
  -o output.pdf \
  http://localhost:8000/tex2pdf/download/28f5bf9b-587f-4f3c-a3de-4d737d9736ce
```

As with `curl`, replace `path/to/your/file.zip` with the path to your `.zip` file. The output will be redirected to `output.pdf` in the current directory.
### Health Check

## Troubleshooting
```bash
curl http://localhost:8000/health
```

If you encounter any issues with the conversion process, ensure that your `.zip` file is structured correctly, with a `main.tex` file at the root. For more detailed error information, consult the service logs.
Response:
```json
{
  "status": "healthy",
  "version": "1.0.0",
  "database": "connected",
  "storage": true
}
```

## Advanced Usage

### Compilation Options

You can customize the LaTeX compilation process:

```bash
curl -X POST \
  -H "X-API-Key: 1234" \
  -F "zip_file=@my_latex_files.zip" \
  -F "options={\"main_file\": \"document.tex\", \"num_runs\": 3, \"use_bibtex\": true}" \
  http://localhost:8000/tex2pdf
```

Options (a Python client sketch follows this list):
- `main_file`: Main LaTeX file to compile (default: `main.tex`)
- `num_runs`: Number of compilation runs (default: 2)
- `use_bibtex`: Run BibTeX for bibliography processing (default: false)
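
The same request can be issued from Python. The sketch below is illustrative only: it assumes the `requests` package is installed (it is not listed in `requirements.txt`), that the service runs locally on port 8000, and that `1234` is one of the configured API keys, as in the example `docker-compose.yaml`.

```python
# Illustrative client sketch, not an official client.
import json
import time

import requests  # assumed extra dependency: pip install requests

BASE_URL = "http://localhost:8000"   # assumed local deployment
HEADERS = {"X-API-Key": "1234"}      # example key from docker-compose.yaml

options = {"main_file": "document.tex", "num_runs": 3, "use_bibtex": True}

# Submit the job: multipart upload of the ZIP plus the JSON-encoded options field
with open("my_latex_files.zip", "rb") as f:
    submit = requests.post(
        f"{BASE_URL}/tex2pdf",
        headers=HEADERS,
        files={"zip_file": ("my_latex_files.zip", f, "application/zip")},
        data={"options": json.dumps(options)},
    )
submit.raise_for_status()
job_id = submit.json()["job_id"]

# Poll the status endpoint until the job either completes or fails
while True:
    status = requests.get(f"{BASE_URL}/tex2pdf/status/{job_id}", headers=HEADERS).json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)

# Download the PDF for a completed job
if status["status"] == "completed":
    pdf = requests.get(f"{BASE_URL}/tex2pdf/download/{job_id}", headers=HEADERS)
    with open("output.pdf", "wb") as out:
        out.write(pdf.content)
```

The status values seen while polling mirror the job states stored in the service's SQLite `jobs` table (`uploading`, `extracting`, `queued`, `processing`, `completed`, `failed`).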

## ZIP File Requirements

- The ZIP file must contain all necessary files for LaTeX compilation
- By default, the service looks for `main.tex` as the main file
- All referenced files (images, styles, etc.) should be included
- Paths in LaTeX files should be relative and match the ZIP structure (see the packaging sketch below)
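
As an illustration, a compliant archive can be built with Python's standard `zipfile` module. The directory and file names below are placeholders, not part of this repository.

```python
# Package a LaTeX project into a ZIP the service can compile (paths are examples).
import zipfile
from pathlib import Path

project = Path("my_paper")  # contains main.tex plus any images, .bib and .sty files

with zipfile.ZipFile("my_latex_files.zip", "w", zipfile.ZIP_DEFLATED) as archive:
    for path in project.rglob("*"):
        if path.is_file():
            # Store paths relative to the project root so main.tex ends up at the top level
            archive.write(path, arcname=path.relative_to(project))
```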

## Configuration

The service can be configured via environment variables in the docker-compose.yaml file:

| Variable | Description | Default |
|----------|-------------|---------|
| `ALLOWED_API_KEYS` | Comma-separated list of valid API keys | "" (empty = no auth) |
| `API_KEY_REQUIRED` | Enable/disable API key validation | "true" |
| `MAX_WORKERS` | Number of uvicorn workers | 2 |
| `MAX_UPLOAD_SIZE` | Maximum file upload size in bytes | 52428800 (50MB) |
| `MAX_COMPILATION_TIME` | Maximum LaTeX compilation time in seconds | 240 |
| `RATE_LIMIT_WINDOW` | Rate limiting window in seconds | 60 |
| `MAX_REQUESTS_PER_WINDOW` | Maximum requests per rate limit window | 10 |
| `JOB_EXPIRY` | Job expiry time in seconds | 3600 (1 hour) |
| `JOBS_DIR` | Directory for storing PDF files | "/data/jobs" |
| `DB_PATH` | Path to SQLite database | "/data/db/jobs.db" |

## Deployment

### System Requirements

- Docker and Docker Compose
- For running without Docker:
  - Python 3.10+
  - LaTeX distribution (texlive)
  - SQLite3

### Production Deployment Considerations

For production deployments, consider:

1. **Configure a reverse proxy** (like Nginx) with HTTPS
2. **Adjust resource limits** based on your workload
3. **Set strong API keys** and restrict access
4. **Mount persistent volumes** for job data
5. **Monitor disk usage** and adjust `JOB_EXPIRY` accordingly
6. **Set up logging** to a centralized logging service

## Architecture

The service uses a stateless design with background processing:

1. **FastAPI Application**: Handles HTTP requests and responses
2. **SQLite Database**: Stores job metadata and status
3. **File System**: Stores generated PDFs and temporary files
4. **Background Tasks**: Process LaTeX compilation asynchronously

## Development

### Local Development Setup

```bash
# Clone the repository
git clone https://github.com/yourusername/tex2pdf.git
cd tex2pdf

# Create a virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt

# Run the service
uvicorn main:app --reload --host 0.0.0.0 --port 8000
```

### Running Tests

A minimal example test module is sketched after the command below.

```bash
pytest tests/
```
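
This commit does not include the `tests/` directory, so the sketch below is only one way such a test could look. It uses FastAPI's `TestClient` (which needs `httpx` installed) against the `/health` endpoint. Because `main.py` reads the `VERSION` file and creates `JOBS_DIR`/`DB_PATH` at import time, run it from the repository root and point those variables at a writable location first.

```python
# tests/test_health.py (hypothetical file name, not part of this commit)
import os

# Redirect the data paths before importing main.py, which creates them on import
os.environ.setdefault("JOBS_DIR", "/tmp/tex2pdf-test/jobs")
os.environ.setdefault("DB_PATH", "/tmp/tex2pdf-test/db/jobs.db")

from fastapi.testclient import TestClient  # requires httpx

from main import app

client = TestClient(app)


def test_health_reports_healthy():
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json()["status"] == "healthy"
```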

## License

[MIT License](LICENSE)

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Security Considerations

This service implements several security measures:

- API key authentication
- Input validation
- Rate limiting
- Safe ZIP extraction
- Process isolation

Even so, be aware that allowing users to run LaTeX compilation on your server carries inherent risks. Always deploy behind a secure gateway in production environments.

docker-compose.yaml: 21 changed lines (Normal file)
@@ -0,0 +1,21 @@
services:
  app:
    image: rbehzadan/tex2pdf
    container_name: tex2pdf
    ports:
      - "8000:8000"
    environment:
      - ALLOWED_API_KEYS=1234,5678,abcd  # Comma-separated list of allowed API keys
      - API_KEY_REQUIRED=true            # Set to "false" to disable API key validation
      - MAX_WORKERS=4                    # Number of uvicorn workers
      - MAX_UPLOAD_SIZE=52428800         # 50MB in bytes
      - MAX_COMPILATION_TIME=240         # Maximum LaTeX compilation time in seconds
      - RATE_LIMIT_WINDOW=60             # Rate limiting window in seconds
      - MAX_REQUESTS_PER_WINDOW=10       # Maximum requests per rate limit window
      - JOB_EXPIRY=3600                  # Job expiry time in seconds (1 hour)
    volumes:
      - pdf_data:/data
    restart: unless-stopped

volumes:
  pdf_data:

main.py: 738 changed lines
@@ -1,60 +1,706 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse
from fastapi import FastAPI, File, UploadFile, HTTPException, Depends, BackgroundTasks, Request
from fastapi.responses import StreamingResponse, FileResponse
from io import BytesIO
import asyncio
import tempfile
import zipfile
import os
import logging
import shutil
import re
import uuid
import json
import time
from typing import Optional, Dict, List, Any
from pathlib import Path
import contextlib
from pydantic import BaseModel, Field
import sqlite3
from concurrent.futures import ThreadPoolExecutor

app = FastAPI()
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("tex2pdf-service")

@app.post("/tex2pdf")
async def convert_to_pdf(zip_file: UploadFile = File(...)):
    if zip_file.filename.endswith('.zip'):
        with tempfile.TemporaryDirectory() as tmpdirname:
            # Unpack the zip file
            with zipfile.ZipFile(BytesIO(await zip_file.read())) as z:
                z.extractall(tmpdirname)
app = FastAPI(title="LaTeX to PDF Conversion Service")

# Configuration
MAX_UPLOAD_SIZE = int(os.environ.get("MAX_UPLOAD_SIZE", 50 * 1024 * 1024))  # Default: 50 MB
API_KEY_NAME = os.environ.get("API_KEY_NAME", "X-API-Key")
ALLOWED_API_KEYS = os.environ.get("ALLOWED_API_KEYS", "").split(",")
MAX_COMPILATION_TIME = int(os.environ.get("MAX_COMPILATION_TIME", 240))  # Default: 240 seconds
RATE_LIMIT_WINDOW = int(os.environ.get("RATE_LIMIT_WINDOW", 60))  # Default: 60 seconds
MAX_REQUESTS_PER_WINDOW = int(os.environ.get("MAX_REQUESTS_PER_WINDOW", 10))  # Default: 10 requests
JOB_EXPIRY = int(os.environ.get("JOB_EXPIRY", 3600))  # Default: 1 hour
JOBS_DIR = os.environ.get("JOBS_DIR", "/app/jobs")
DB_PATH = os.environ.get("DB_PATH", "/app/db/jobs.db")
API_KEY_REQUIRED = len(ALLOWED_API_KEYS) > 0
if API_KEY_REQUIRED:
    API_KEY_REQUIRED = os.environ.get("API_KEY_REQUIRED", "true").lower() in ("true", "1", "yes")
VERSION = open("VERSION").read().strip()

# Create necessary directories
os.makedirs(JOBS_DIR, exist_ok=True)
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

# Initialize SQLite database
def init_db():
    with sqlite3.connect(DB_PATH) as conn:
        conn.execute('''
        CREATE TABLE IF NOT EXISTS jobs (
            id TEXT PRIMARY KEY,
            status TEXT NOT NULL,
            created_at REAL NOT NULL,
            work_dir TEXT,
            api_key TEXT,
            options TEXT,
            error TEXT,
            progress TEXT,
            updated_at REAL NOT NULL
        )
        ''')
        # Add index for faster lookups
        conn.execute('CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at)')

# Thread pool for database operations
executor = ThreadPoolExecutor(max_workers=4)

# In-memory rate limiting
rate_limits: Dict[str, List[float]] = {}

class ConversionOptions(BaseModel):
    main_file: str = Field(default="main.tex", description="Main LaTeX file to compile")
    num_runs: int = Field(default=2, ge=1, le=5, description="Number of compilation runs")
    use_bibtex: bool = Field(default=False, description="Run BibTeX for bibliography")

def verify_api_key(request: Request):
    # If API keys are not required, skip validation
    if not API_KEY_REQUIRED:
        return "no_auth"

            # Change working directory to tmpdirname
            os.chdir(tmpdirname)
    api_key = request.headers.get(API_KEY_NAME)

    # Check if API key is provided and valid
    if not api_key:
        logger.warning("Missing API key in request")
        raise HTTPException(
            status_code=401,
            detail="API key required",
        )

    if not ALLOWED_API_KEYS or api_key not in ALLOWED_API_KEYS:
        logger.warning(f"Unauthorized access attempt with API key: {api_key[:5]}...")
        raise HTTPException(
            status_code=401,
            detail="Invalid API key",
        )

    return api_key

def check_rate_limit(request: Request, api_key: str = Depends(verify_api_key)):
    client_id = api_key or request.client.host
    current_time = time.time()

    if client_id not in rate_limits:
        rate_limits[client_id] = []

    # Remove timestamps outside the window
    rate_limits[client_id] = [t for t in rate_limits[client_id] if current_time - t < RATE_LIMIT_WINDOW]

    if len(rate_limits[client_id]) >= MAX_REQUESTS_PER_WINDOW:
        logger.warning(f"Rate limit exceeded for {client_id[:5]}...")
        raise HTTPException(
            status_code=429,
            detail=f"Rate limit exceeded. Maximum {MAX_REQUESTS_PER_WINDOW} requests per {RATE_LIMIT_WINDOW} seconds.",
        )

    rate_limits[client_id].append(current_time)
    return client_id

def validate_latex_filename(filename: str) -> bool:
    """Validate if the filename follows safe LaTeX filename conventions."""
    return bool(re.match(r'^[a-zA-Z0-9_\-\.]+\.tex$', filename))

def sanitize_zip_archive(zip_file_obj, extract_path):
    """Extracts zip contents safely, preventing directory traversal attacks."""
    try:
        with zipfile.ZipFile(zip_file_obj) as zip_ref:
            # Log zip contents for debugging
            logger.info(f"ZIP contents: {zip_ref.namelist()}")

            # Find the main LaTeX file (assuming a convention, e.g., main.tex)
            main_tex_file = 'main.tex'
            main_tex_path = os.path.join(tmpdirname, main_tex_file)
            if not os.path.exists(main_tex_path):
                raise HTTPException(status_code=400, detail="Main LaTeX file (main.tex) not found in the zip.")
            # First, check for suspicious paths
            for file_info in zip_ref.infolist():
                # Convert to Path for safer path handling
                file_path = Path(file_info.filename)

                # Check for absolute paths or directory traversal attempts
                if file_path.is_absolute() or '..' in file_path.parts:
                    raise ValueError(f"Suspicious path detected: {file_info.filename}")

                # Check for extremely large files
                if file_info.file_size > MAX_UPLOAD_SIZE:
                    raise ValueError(f"File too large: {file_info.filename}")

            # If all files pass validation, extract them
            for file_info in zip_ref.infolist():
                # Skip directories
                if file_info.filename.endswith('/'):
                    continue

                # Create a safe extraction path
                target_path = Path(extract_path) / file_info.filename

                # Create parent directories if they don't exist
                target_path.parent.mkdir(parents=True, exist_ok=True)

                # Extract the file
                with zip_ref.open(file_info) as source, open(target_path, 'wb') as target:
                    shutil.copyfileobj(source, target)

            # List extracted files for debugging
            extracted_files = list(Path(extract_path).glob('**/*'))
            logger.info(f"Extracted files: {[str(f.relative_to(extract_path)) for f in extracted_files]}")

        return True
    except zipfile.BadZipFile:
        raise ValueError("Invalid ZIP file format")
    except Exception as e:
        logger.error(f"Error during ZIP extraction: {str(e)}", exc_info=True)
        raise ValueError(f"Error extracting ZIP: {str(e)}")

            # Compile the LaTeX document
            cmd = ['pdflatex', '-interaction=nonstopmode', '-output-directory', tmpdirname, main_tex_path]
            process = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
            try:
                print(f"Running pdflatex on {main_tex_path}")
                stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=120)
                # print(f"pdflatex output: {stdout.decode()}")
                # print(f"pdflatex errors: {stderr.decode()}")
            except asyncio.TimeoutError:
                return {"error": "Conversion timed out."}
@contextlib.contextmanager
def working_directory(path):
    """Changes working directory within the context and reverts back afterwards."""
    origin = os.getcwd()
    try:
        os.chdir(path)
        yield
    finally:
        os.chdir(origin)

async def run_latex_command(cmd, timeout=MAX_COMPILATION_TIME):
    """Run a LaTeX-related command with proper timeout and error handling."""
    logger.info(f"Running command: {' '.join(cmd)}")

    process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )

    try:
        stdout, stderr = await asyncio.wait_for(
            process.communicate(),
            timeout=timeout
        )

        stdout_text = stdout.decode('utf-8', errors='replace')
        stderr_text = stderr.decode('utf-8', errors='replace')

        logger.info(f"Command returned with code {process.returncode}")
        if process.returncode != 0:
            logger.warning(f"Command failed with stderr: {stderr_text[:500]}...")

        return {
            "returncode": process.returncode,
            "stdout": stdout_text,
            "stderr": stderr_text
        }
    except asyncio.TimeoutError:
        # Try to terminate the process
        logger.error(f"Command timed out after {timeout} seconds: {' '.join(cmd)}")
        process.terminate()
        try:
            await asyncio.wait_for(process.wait(), timeout=5)
        except asyncio.TimeoutError:
            # If it doesn't terminate, force kill
            process.kill()

        raise TimeoutError(f"Command timed out after {timeout} seconds: {' '.join(cmd)}")

            if process.returncode != 0:
                # Compilation failed
                return {
                    "error": "Conversion failed.",
                    "details": {
                        "stderr": stderr.decode(),
                        "stdout": stdout.decode(),
                    },
                }
# Database operations
def store_job(job_id: str, job_data: Dict[str, Any]):
    """Store job data in SQLite database"""
    current_time = time.time()

    # Extract fields from job_data
    status = job_data.get("status", "unknown")
    created_at = job_data.get("created_at", current_time)
    work_dir = job_data.get("work_dir", "")
    api_key = job_data.get("api_key", "")
    options = json.dumps(job_data.get("options", {}))
    error = job_data.get("error", "")
    progress = job_data.get("progress", "")

    with sqlite3.connect(DB_PATH) as conn:
        conn.execute(
            '''
            INSERT OR REPLACE INTO jobs
            (id, status, created_at, work_dir, api_key, options, error, progress, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''',
            (job_id, status, created_at, work_dir, api_key, options, error, progress, current_time)
        )
        conn.commit()

            # Assuming the output PDF has the same base name as the main LaTeX file
            output_pdf_path = os.path.join(tmpdirname, 'main.pdf')
            if os.path.exists(output_pdf_path):
                with open(output_pdf_path, 'rb') as f:
                    pdf_content = f.read()
                return StreamingResponse(BytesIO(pdf_content), media_type='application/pdf')
            else:
                return {"error": "PDF file not generated."}
    else:
        raise HTTPException(status_code=400, detail="Uploaded file is not a zip file.")
def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Retrieve job data from SQLite database"""
    with sqlite3.connect(DB_PATH) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.execute('SELECT * FROM jobs WHERE id = ?', (job_id,))
        row = cursor.fetchone()

    if row:
        job_data = dict(row)
        # Parse options back to dict
        if job_data.get('options'):
            job_data['options'] = json.loads(job_data['options'])
        return job_data
    return None

def update_job(job_id: str, updates: Dict[str, Any]):
    """Update specific fields in the job data"""
    current_time = time.time()

    # Start with SET updated_at=?
    set_values = ["updated_at=?"]
    params = [current_time]

    # Add each update field
    for key, value in updates.items():
        if key == 'options':
            value = json.dumps(value)
        set_values.append(f"{key}=?")
        params.append(value)

    # Add job_id as the last parameter
    params.append(job_id)

    with sqlite3.connect(DB_PATH) as conn:
        query = f"UPDATE jobs SET {', '.join(set_values)} WHERE id = ?"
        conn.execute(query, params)
        conn.commit()

def get_pdf_path(job_id: str) -> str:
    """Get the path where the PDF should be stored"""
    return os.path.join(JOBS_DIR, f"{job_id}.pdf")

def store_pdf(job_id: str, pdf_content: bytes):
    """Store PDF in the filesystem"""
    pdf_path = get_pdf_path(job_id)
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

    with open(pdf_path, 'wb') as f:
        f.write(pdf_content)

def get_pdf(job_id: str) -> Optional[bytes]:
    """Retrieve PDF from the filesystem"""
    pdf_path = get_pdf_path(job_id)
    if os.path.exists(pdf_path):
        with open(pdf_path, 'rb') as f:
            return f.read()
    return None

async def compile_latex(
    job_id: str,
    work_dir: str,
    main_file: str,
    num_runs: int,
    use_bibtex: bool
):
    """Compile LaTeX document with proper error handling and multiple runs if needed."""
    results = []
    main_tex_path = os.path.join(work_dir, main_file)

    # Verify the main file exists
    if not os.path.exists(main_tex_path):
        logger.error(f"Main LaTeX file not found: {main_tex_path}")
        update_job(job_id, {
            "status": "failed",
            "error": f"Main LaTeX file ({main_file}) not found in the archive."
        })
        return False

    # List directory contents for debugging
    logger.info(f"Work directory contents: {os.listdir(work_dir)}")

    try:
        with working_directory(work_dir):
            # Run pdflatex multiple times as needed
            for i in range(num_runs):
                update_job(job_id, {
                    "status": "processing",
                    "progress": f"LaTeX compilation {i+1}/{num_runs}"
                })

                # For verbose output to diagnose issues
                cmd = [
                    'pdflatex',
                    '-interaction=nonstopmode',
                    '-file-line-error',
                    main_file
                ]

                try:
                    result = await run_latex_command(cmd)
                    results.append(result)

                    # If compilation failed, stop and provide details
                    if result["returncode"] != 0:
                        # Extract relevant error messages
                        error_lines = []
                        for line in result["stdout"].split('\n'):
                            if ":" in line and ("Error" in line or "Fatal" in line):
                                error_lines.append(line)

                        error_message = "LaTeX compilation failed"
                        if error_lines:
                            error_message = f"LaTeX errors: {' | '.join(error_lines[:3])}"

                        update_job(job_id, {
                            "status": "failed",
                            "error": error_message,
                            "details": json.dumps(result)
                        })
                        return False

                    # Run bibtex if requested (after the first pdflatex run)
                    if use_bibtex and i == 0:
                        update_job(job_id, {
                            "status": "processing",
                            "progress": "Running BibTeX"
                        })

                        basename = os.path.splitext(main_file)[0]
                        bibtex_cmd = ['bibtex', basename]

                        bibtex_result = await run_latex_command(bibtex_cmd)
                        results.append(bibtex_result)

                except TimeoutError as e:
                    logger.error(f"Timeout during compilation: {str(e)}")
                    update_job(job_id, {
                        "status": "failed",
                        "error": str(e)
                    })
                    return False
                except Exception as e:
                    logger.error(f"Unexpected error during compilation: {str(e)}", exc_info=True)
                    update_job(job_id, {
                        "status": "failed",
                        "error": f"Unexpected error: {str(e)}"
                    })
                    return False

            # Check if the PDF was generated
            pdf_basename = os.path.splitext(main_file)[0]
            pdf_path = os.path.join(work_dir, f"{pdf_basename}.pdf")

            if not os.path.exists(pdf_path):
                logger.error(f"PDF not generated at expected path: {pdf_path}")
                update_job(job_id, {
                    "status": "failed",
                    "error": "PDF file not generated despite successful compilation"
                })
                return False

            # Store the PDF in the filesystem
            with open(pdf_path, 'rb') as f:
                pdf_content = f.read()
                store_pdf(job_id, pdf_content)

            # Update job status
            update_job(job_id, {
                "status": "completed",
            })
            return True

    except Exception as e:
        logger.error(f"Exception in compile_latex: {str(e)}", exc_info=True)
        update_job(job_id, {
            "status": "failed",
            "error": f"Unexpected error: {str(e)}"
        })
        return False

# Clean up old jobs (runs in background)
async def cleanup_old_jobs():
    """Clean up old jobs and their resources"""
    while True:
        try:
            current_time = time.time()
            expiry_time = current_time - JOB_EXPIRY

            # Get expired jobs
            with sqlite3.connect(DB_PATH) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute('SELECT id, work_dir FROM jobs WHERE created_at < ?', (expiry_time,))
                expired_jobs = cursor.fetchall()

            for job in expired_jobs:
                job_id = job['id']
                work_dir = job['work_dir']

                # Clean up PDF if it exists
                pdf_path = get_pdf_path(job_id)
                if os.path.exists(pdf_path):
                    os.remove(pdf_path)

                # Clean up work directory if it exists
                if work_dir and os.path.exists(work_dir):
                    shutil.rmtree(work_dir, ignore_errors=True)

                # Remove job from database
                with sqlite3.connect(DB_PATH) as conn:
                    conn.execute('DELETE FROM jobs WHERE id = ?', (job_id,))
                    conn.commit()

                logger.info(f"Cleaned up expired job {job_id}")

        except Exception as e:
            logger.error(f"Error in cleanup task: {str(e)}", exc_info=True)

        # Run cleanup every 15 minutes
        await asyncio.sleep(900)

@app.post("/tex2pdf",
          dependencies=[Depends(check_rate_limit)],
          summary="Convert LaTeX files to PDF",
          response_description="Returns job ID for status checking")
async def convert_to_pdf(
    background_tasks: BackgroundTasks,
    request: Request,
    zip_file: UploadFile = File(...),
    options: Optional[ConversionOptions] = None
):
    """
    Takes a zip file containing LaTeX files and compiles them into a PDF.

    - The zip file must contain all necessary files for compilation
    - By default, assumes main.tex is the main file unless specified otherwise
    - Returns a job ID that can be used to check status and retrieve the PDF
    """
    api_key = verify_api_key(request)
    start_time = time.time()
    job_id = str(uuid.uuid4())

    if options is None:
        options = ConversionOptions()

    logger.info(f"Starting conversion job {job_id}")

    # Validate input
    if not zip_file.filename.endswith('.zip'):
        logger.warning(f"Job {job_id}: Invalid file format: {zip_file.filename}")
        raise HTTPException(
            status_code=400,
            detail="Uploaded file must be a zip archive."
        )

    if not validate_latex_filename(options.main_file):
        logger.warning(f"Job {job_id}: Invalid main file name: {options.main_file}")
        raise HTTPException(
            status_code=400,
            detail="Main file name must be a valid LaTeX filename (e.g., main.tex)"
        )

    # Create the job record
    job_data = {
        "id": job_id,
        "status": "uploading",
        "created_at": start_time,
        "options": options.dict(),
        "api_key": api_key,
    }
    store_job(job_id, job_data)

    try:
        # Create a temporary directory for this job
        work_dir = tempfile.mkdtemp(prefix=f"tex2pdf_{job_id}_")
        update_job(job_id, {
            "status": "extracting",
            "work_dir": work_dir
        })

        # Read zip file to memory
        zip_content = await zip_file.read()
        if len(zip_content) > MAX_UPLOAD_SIZE:
            logger.warning(f"Job {job_id}: File too large: {len(zip_content)} bytes")
            update_job(job_id, {
                "status": "failed",
                "error": f"File too large. Maximum size: {MAX_UPLOAD_SIZE/1024/1024} MB"
            })
            return {
                "job_id": job_id,
                "status": "failed",
                "message": "File too large"
            }

        # Extract zip files safely
        try:
            sanitize_zip_archive(BytesIO(zip_content), work_dir)
            update_job(job_id, {"status": "queued"})
        except ValueError as e:
            logger.warning(f"Job {job_id}: Zip extraction failed: {str(e)}")
            update_job(job_id, {
                "status": "failed",
                "error": f"Zip extraction failed: {str(e)}"
            })
            return {
                "job_id": job_id,
                "status": "failed",
                "message": str(e)
            }

        # Start compilation in background
        background_tasks.add_task(
            compile_latex,
            job_id,
            work_dir,
            options.main_file,
            options.num_runs,
            options.use_bibtex
        )

        return {
            "job_id": job_id,
            "status": "processing",
            "message": "Conversion job started"
        }

    except Exception as e:
        logger.error(f"Job {job_id}: Unexpected error: {str(e)}", exc_info=True)
        update_job(job_id, {
            "status": "failed",
            "error": f"Unexpected error: {str(e)}"
        })
        return {
            "job_id": job_id,
            "status": "failed",
            "message": "Server error"
        }

@app.get("/tex2pdf/status/{job_id}",
         dependencies=[Depends(verify_api_key)],
         summary="Check the status of a conversion job")
async def check_job_status(job_id: str):
    """Check the status of a previously submitted conversion job."""
    job = get_job(job_id)
    if not job:
        raise HTTPException(
            status_code=404,
            detail="Job not found"
        )

    # Clean sensitive or internal information
    response = {
        "job_id": job_id,
        "status": job["status"],
        "created_at": job["created_at"],
    }

    # Add error details if failed
    if job["status"] == "failed" and "error" in job:
        response["error"] = job["error"]

    # Add progress info if processing
    if job["status"] == "processing" and "progress" in job:
        response["progress"] = job["progress"]

    return response

@app.get("/tex2pdf/download/{job_id}",
         dependencies=[Depends(verify_api_key)],
         summary="Download the generated PDF")
async def download_pdf(job_id: str):
    """Download the PDF generated by a completed conversion job."""
    job = get_job(job_id)
    if not job:
        raise HTTPException(
            status_code=404,
            detail="Job not found"
        )

    if job["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"PDF not ready. Current status: {job['status']}"
        )

    try:
        # Option 1: Get PDF from memory and stream it
        # pdf_content = get_pdf(job_id)
        # if not pdf_content:
        #     raise HTTPException(
        #         status_code=404,
        #         detail="PDF file not found in storage"
        #     )
        #
        # # Generate a filename based on the job ID
        # filename = f"document_{job_id[-6:]}.pdf"
        #
        # return StreamingResponse(
        #     BytesIO(pdf_content),
        #     media_type='application/pdf',
        #     headers={"Content-Disposition": f"attachment; filename={filename}"}
        # )

        # Option 2: Use FileResponse for more efficient file serving
        pdf_path = get_pdf_path(job_id)
        if not os.path.exists(pdf_path):
            raise HTTPException(
                status_code=404,
                detail="PDF file not found in storage"
            )

        filename = f"document_{job_id[-6:]}.pdf"

        return FileResponse(
            pdf_path,
            media_type='application/pdf',
            filename=filename
        )
    except Exception as e:
        logger.error(f"Error delivering PDF for job {job_id}: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail="Error retrieving PDF file"
        )

@app.get("/health", summary="Health check endpoint")
async def health_check():
    """Simple health check endpoint to verify the API is running."""
    try:
        # Check database connection
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.execute("SELECT 1")
            cursor.fetchone()
        db_status = "connected"
    except Exception as e:
        db_status = f"error: {str(e)}"

    return {
        "status": "healthy",
        "version": VERSION,
        "database": db_status,
        "storage": os.path.exists(JOBS_DIR) and os.access(JOBS_DIR, os.W_OK)
    }

@app.on_event("startup")
async def startup_event():
    logger.info("Service starting up")
    # Initialize the database
    init_db()
    # Start background cleanup task
    asyncio.create_task(cleanup_old_jobs())

@app.on_event("shutdown")
async def shutdown_event():
    """Clean up on shutdown"""
    logger.info("Service shutting down")
    executor.shutdown(wait=False)

requirements.txt: 5 changed lines (Normal file)
@@ -0,0 +1,5 @@
fastapi==0.110.0
uvicorn==0.27.1
pydantic==2.5.3
python-multipart==0.0.6
aiofiles==23.2.1