Version 2.3.2 (Complete refactor)

This commit is contained in:
Reza Behzadan 2025-03-08 06:29:14 -05:00
parent dcb57e5a2b
commit 9963a25e4a
7 changed files with 985 additions and 89 deletions

.dockerignore Normal file

@@ -0,0 +1,10 @@
.gitignore
.dockerignore
.git
.archive
.env
.env.*
logs
Dockerfile
README.md
docker-compose.yaml

Dockerfile

@@ -1,14 +1,72 @@
FROM python:3.11-slim AS builder
# Set working directory
WORKDIR /app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
gcc \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip wheel --no-cache-dir --wheel-dir /app/wheels -r requirements.txt
# Create final image
FROM python:3.11-slim AS final
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
PORT=8000 \
MAX_WORKERS=4 \
TZ=UTC \
JOBS_DIR=/data/jobs \
DB_PATH=/data/db/jobs.db
# Create a non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser
# Install LaTeX and required dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
texlive-full \
tini \
sqlite3 \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Create persistent directories
RUN mkdir -p /data/jobs /data/db /app/temp && \
chown -R appuser:appuser /app /data
# Copy Python wheels from builder stage
COPY --from=builder /app/wheels /wheels
# Install Python dependencies
RUN pip install --no-cache-dir /wheels/* && rm -rf /wheels
# Copy application code
COPY . /app/
# Create entrypoint script that properly handles environment variables
RUN echo '#!/bin/sh\n\
exec uvicorn main:app --host 0.0.0.0 --port $PORT --workers $MAX_WORKERS --log-level info\n\
' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh
# Switch to non-root user
USER appuser
# Expose the service port
EXPOSE 8000
# Use tini as init
ENTRYPOINT ["/usr/bin/tini", "--"]
# Run the application with proper signal handling
CMD ["/app/entrypoint.sh"]

README.md

@@ -1,58 +1,213 @@
# LaTeX to PDF Conversion Service
A high-performance, secure REST API for converting LaTeX documents to PDF format.
## Features
- **Simple API**: Upload a ZIP file containing LaTeX documents and get a PDF back
- **Secure Processing**: Comprehensive security measures including input validation and sanitization
- **Multiple Workers**: Designed for concurrency with shared file system and SQLite database
- **Robust Error Handling**: Detailed error messages with LaTeX compilation logs
- **Automatic Cleanup**: Background process removes expired PDFs and temporary files
- **Configurable Options**: Multiple compilation runs, BibTeX support, custom main file name
- **API Key Authentication**: Optional security layer with configurable API keys
- **Rate Limiting**: Protection against API abuse
- **Resource Control**: Limits on file sizes and compilation time
- **Docker Ready**: Ready-to-use Docker and Docker Compose configurations
## Quick Start
The easiest way to run the service is with Docker Compose:
```bash
# Clone the repository
git clone https://github.com/yourusername/tex2pdf.git
cd tex2pdf
# Start the service
docker-compose up -d
```
The service will be available at `http://localhost:8000`.
## API Usage
### Convert LaTeX to PDF
```bash
curl -X POST \
-H "X-API-Key: 1234" \
-F "zip_file=@my_latex_files.zip" \
http://localhost:8000/tex2pdf
```
Response:
```json
{
"job_id": "28f5bf9b-587f-4f3c-a3de-4d737d9736ce",
"status": "processing",
"message": "Conversion job started"
}
```
### Check Job Status
```bash
curl -X GET \
-H "X-API-Key: 1234" \
http://localhost:8000/tex2pdf/status/28f5bf9b-587f-4f3c-a3de-4d737d9736ce
```
Response:
```json
{
"job_id": "28f5bf9b-587f-4f3c-a3de-4d737d9736ce",
"status": "completed",
"created_at": 1741424390.6039968
}
```
### Download PDF
```bash
curl -X GET \
-H "X-API-Key: 1234" \
-o output.pdf \
http://localhost:8000/tex2pdf/download/28f5bf9b-587f-4f3c-a3de-4d737d9736ce
```
### Health Check
```bash
curl http://localhost:8000/health
```
Response:
```json
{
"status": "healthy",
"version": "1.0.0",
"database": "connected",
"storage": true
}
```
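For scripted use, the full submit, poll, and download flow can look like this minimal Python sketch (it assumes the third-party `requests` package, a local deployment, and the example key `1234` from `docker-compose.yaml`):
```python
import time
import requests

BASE = "http://localhost:8000"   # assumed local deployment
HEADERS = {"X-API-Key": "1234"}  # example key from docker-compose.yaml

# Submit a conversion job
with open("my_latex_files.zip", "rb") as f:
    resp = requests.post(f"{BASE}/tex2pdf", headers=HEADERS,
                         files={"zip_file": ("my_latex_files.zip", f)})
resp.raise_for_status()
job_id = resp.json()["job_id"]

# Poll until the job reaches a terminal state
while True:
    status = requests.get(f"{BASE}/tex2pdf/status/{job_id}", headers=HEADERS).json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)

# Download the PDF on success
if status["status"] == "completed":
    pdf = requests.get(f"{BASE}/tex2pdf/download/{job_id}", headers=HEADERS)
    with open("output.pdf", "wb") as out:
        out.write(pdf.content)
```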
## Advanced Usage
### Compilation Options
You can customize the LaTeX compilation process:
```bash
curl -X POST \
-H "X-API-Key: 1234" \
-F "zip_file=@my_latex_files.zip" \
-F "options={\"main_file\": \"document.tex\", \"num_runs\": 3, \"use_bibtex\": true}" \
http://localhost:8000/tex2pdf
```
Options:
- `main_file`: Main LaTeX file to compile (default: `main.tex`)
- `num_runs`: Number of compilation runs (default: 2)
- `use_bibtex`: Run BibTeX for bibliography processing (default: false)
## ZIP File Requirements
- The ZIP file must contain all necessary files for LaTeX compilation
- By default, the service looks for `main.tex` as the main file
- All referenced files (images, styles, etc.) should be included
- Paths in LaTeX files should be relative and match the ZIP structure
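One way to build a compliant archive from a project directory, keeping every path relative, is sketched below (`my-paper/` and `my_latex_files.zip` are hypothetical names):
```python
import zipfile
from pathlib import Path

project = Path("my-paper")  # hypothetical project directory containing main.tex

with zipfile.ZipFile("my_latex_files.zip", "w", zipfile.ZIP_DEFLATED) as zf:
    for path in sorted(project.rglob("*")):
        if path.is_file():
            # Store paths relative to the project root so references like
            # \includegraphics{figs/plot.png} resolve in the service's work dir
            zf.write(path, arcname=path.relative_to(project))
```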
## Configuration
The service can be configured via environment variables in the docker-compose.yaml file:
| Variable | Description | Default |
|----------|-------------|---------|
| `ALLOWED_API_KEYS` | Comma-separated list of valid API keys | "" (empty = no auth) |
| `API_KEY_REQUIRED` | Enable/disable API key validation | "true" |
| `MAX_WORKERS` | Number of uvicorn workers | 2 |
| `MAX_UPLOAD_SIZE` | Maximum file upload size in bytes | 52428800 (50MB) |
| `MAX_COMPILATION_TIME` | Maximum LaTeX compilation time in seconds | 240 |
| `RATE_LIMIT_WINDOW` | Rate limiting window in seconds | 60 |
| `MAX_REQUESTS_PER_WINDOW` | Maximum requests per rate limit window | 10 |
| `JOB_EXPIRY` | Job expiry time in seconds | 3600 (1 hour) |
| `JOBS_DIR` | Directory for storing PDF files | "/data/jobs" |
| `DB_PATH` | Path to SQLite database | "/data/db/jobs.db" |
## Deployment
### System Requirements
- Docker and Docker Compose
- For running without Docker:
- Python 3.10+
- LaTeX distribution (texlive)
- SQLite3
### Production Deployment Considerations
For production deployments, consider:
1. **Configure a reverse proxy** (like Nginx) with HTTPS
2. **Adjust resource limits** based on your workload
3. **Set strong API keys** and restrict access
4. **Mount persistent volumes** for job data
5. **Monitor disk usage** and adjust `JOB_EXPIRY` accordingly
6. **Set up logging** to a centralized logging service
## Architecture
The service uses a stateless design with background processing:
1. **FastAPI Application**: Handles HTTP requests and responses
2. **SQLite Database**: Stores job metadata and status
3. **File System**: Stores generated PDFs and temporary files
4. **Background Tasks**: Process LaTeX compilation asynchronously
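As implemented in `main.py`, a submitted job moves through the statuses `uploading` → `extracting` → `queued` → `processing` → `completed` (or `failed`), all recorded in the SQLite `jobs` table, so clients should poll `/tex2pdf/status/{job_id}` until they see a terminal state.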
## Development
### Local Development Setup
```bash
# Clone the repository
git clone https://github.com/yourusername/tex2pdf.git
cd tex2pdf
# Create a virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
# Run the service
uvicorn main:app --reload --host 0.0.0.0 --port 8000
```
### Running Tests
```bash
pytest tests/
```
## License
[MIT License](LICENSE)
## Contributing
Contributions are welcome! Please feel free to submit a Pull Request.
## Security Considerations
This service implements several security measures:
- API key authentication
- Input validation
- Rate limiting
- Safe ZIP extraction
- Process isolation
Be aware, however, that allowing users to run arbitrary LaTeX compilation on your server carries inherent risks. Always deploy behind a secure gateway in production environments.

VERSION Normal file

@@ -0,0 +1 @@
2.3.3

docker-compose.yaml Normal file

@@ -0,0 +1,21 @@
services:
app:
image: rbehzadan/tex2pdf
container_name: tex2pdf
ports:
- "8000:8000"
environment:
- ALLOWED_API_KEYS=1234,5678,abcd # Comma-separated list of allowed API keys
- API_KEY_REQUIRED=true # Set to "false" to disable API key validation
- MAX_WORKERS=4 # Number of uvicorn workers
- MAX_UPLOAD_SIZE=52428800 # 50MB in bytes
- MAX_COMPILATION_TIME=240 # Maximum LaTeX compilation time in seconds
- RATE_LIMIT_WINDOW=60 # Rate limiting window in seconds
- MAX_REQUESTS_PER_WINDOW=10 # Maximum requests per rate limit window
- JOB_EXPIRY=3600 # Job expiry time in seconds (1 hour)
volumes:
- pdf_data:/data
restart: unless-stopped
volumes:
pdf_data:

main.py

@@ -1,60 +1,706 @@
from fastapi import FastAPI, File, UploadFile, HTTPException, Depends, BackgroundTasks, Request
from fastapi.responses import StreamingResponse, FileResponse
from io import BytesIO
import asyncio
import tempfile
import zipfile
import os
import logging
import shutil
import re
import uuid
import json
import time
from typing import Optional, Dict, List, Any
from pathlib import Path
import contextlib
from pydantic import BaseModel, Field
import sqlite3
from concurrent.futures import ThreadPoolExecutor
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("tex2pdf-service")
app = FastAPI(title="LaTeX to PDF Conversion Service")
# Configuration
MAX_UPLOAD_SIZE = int(os.environ.get("MAX_UPLOAD_SIZE", 50 * 1024 * 1024))  # Default: 50 MB
API_KEY_NAME = os.environ.get("API_KEY_NAME", "X-API-Key")
# Filter out empty entries so an unset ALLOWED_API_KEYS means no keys configured
ALLOWED_API_KEYS = [k for k in os.environ.get("ALLOWED_API_KEYS", "").split(",") if k]
MAX_COMPILATION_TIME = int(os.environ.get("MAX_COMPILATION_TIME", 240))  # Default: 240 seconds
RATE_LIMIT_WINDOW = int(os.environ.get("RATE_LIMIT_WINDOW", 60)) # Default: 60 seconds
MAX_REQUESTS_PER_WINDOW = int(os.environ.get("MAX_REQUESTS_PER_WINDOW", 10)) # Default: 10 requests
JOB_EXPIRY = int(os.environ.get("JOB_EXPIRY", 3600)) # Default: 1 hour
JOBS_DIR = os.environ.get("JOBS_DIR", "/app/jobs")
DB_PATH = os.environ.get("DB_PATH", "/app/db/jobs.db")
API_KEY_REQUIRED = len(ALLOWED_API_KEYS) > 0
if API_KEY_REQUIRED:
API_KEY_REQUIRED = os.environ.get("API_KEY_REQUIRED", "true").lower() in ("true", "1", "yes")
VERSION = open("VERSION").read().strip()
# Create necessary directories
os.makedirs(JOBS_DIR, exist_ok=True)
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
# Initialize SQLite database
def init_db():
with sqlite3.connect(DB_PATH) as conn:
conn.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id TEXT PRIMARY KEY,
status TEXT NOT NULL,
created_at REAL NOT NULL,
work_dir TEXT,
api_key TEXT,
options TEXT,
error TEXT,
progress TEXT,
updated_at REAL NOT NULL
)
''')
# Add index for faster lookups
conn.execute('CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at)')
# Thread pool for database operations
executor = ThreadPoolExecutor(max_workers=4)
# In-memory rate limiting
rate_limits: Dict[str, List[float]] = {}
class ConversionOptions(BaseModel):
main_file: str = Field(default="main.tex", description="Main LaTeX file to compile")
num_runs: int = Field(default=2, ge=1, le=5, description="Number of compilation runs")
use_bibtex: bool = Field(default=False, description="Run BibTeX for bibliography")
def verify_api_key(request: Request):
# If API keys are not required, skip validation
if not API_KEY_REQUIRED:
return "no_auth"
api_key = request.headers.get(API_KEY_NAME)
# Check if API key is provided and valid
if not api_key:
logger.warning("Missing API key in request")
raise HTTPException(
status_code=401,
detail="API key required",
)
if not ALLOWED_API_KEYS or api_key not in ALLOWED_API_KEYS:
logger.warning(f"Unauthorized access attempt with API key: {api_key[:5]}...")
raise HTTPException(
status_code=401,
detail="Invalid API key",
)
return api_key
def check_rate_limit(request: Request, api_key: str = Depends(verify_api_key)):
client_id = api_key or request.client.host
current_time = time.time()
if client_id not in rate_limits:
rate_limits[client_id] = []
# Remove timestamps outside the window
rate_limits[client_id] = [t for t in rate_limits[client_id] if current_time - t < RATE_LIMIT_WINDOW]
if len(rate_limits[client_id]) >= MAX_REQUESTS_PER_WINDOW:
logger.warning(f"Rate limit exceeded for {client_id[:5]}...")
raise HTTPException(
status_code=429,
detail=f"Rate limit exceeded. Maximum {MAX_REQUESTS_PER_WINDOW} requests per {RATE_LIMIT_WINDOW} seconds.",
)
rate_limits[client_id].append(current_time)
return client_id
def validate_latex_filename(filename: str) -> bool:
"""Validate if the filename follows safe LaTeX filename conventions."""
return bool(re.match(r'^[a-zA-Z0-9_\-\.]+\.tex$', filename))
def sanitize_zip_archive(zip_file_obj, extract_path):
"""Extracts zip contents safely, preventing directory traversal attacks."""
try:
with zipfile.ZipFile(zip_file_obj) as zip_ref:
# Log zip contents for debugging
logger.info(f"ZIP contents: {zip_ref.namelist()}")
# First, check for suspicious paths
for file_info in zip_ref.infolist():
# Convert to Path for safer path handling
file_path = Path(file_info.filename)
# Check for absolute paths or directory traversal attempts
if file_path.is_absolute() or '..' in file_path.parts:
raise ValueError(f"Suspicious path detected: {file_info.filename}")
# Check for extremely large files
if file_info.file_size > MAX_UPLOAD_SIZE:
raise ValueError(f"File too large: {file_info.filename}")
# If all files pass validation, extract them
for file_info in zip_ref.infolist():
# Skip directories
if file_info.filename.endswith('/'):
continue
# Create a safe extraction path
target_path = Path(extract_path) / file_info.filename
# Create parent directories if they don't exist
target_path.parent.mkdir(parents=True, exist_ok=True)
# Extract the file
with zip_ref.open(file_info) as source, open(target_path, 'wb') as target:
shutil.copyfileobj(source, target)
# List extracted files for debugging
extracted_files = list(Path(extract_path).glob('**/*'))
logger.info(f"Extracted files: {[str(f.relative_to(extract_path)) for f in extracted_files]}")
return True
except zipfile.BadZipFile:
raise ValueError("Invalid ZIP file format")
except Exception as e:
logger.error(f"Error during ZIP extraction: {str(e)}", exc_info=True)
raise ValueError(f"Error extracting ZIP: {str(e)}")
@contextlib.contextmanager
def working_directory(path):
"""Changes working directory within the context and reverts back afterwards."""
origin = os.getcwd()
try:
os.chdir(path)
yield
finally:
os.chdir(origin)
async def run_latex_command(cmd, timeout=MAX_COMPILATION_TIME):
"""Run a LaTeX-related command with proper timeout and error handling."""
logger.info(f"Running command: {' '.join(cmd)}")
process = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
try:
stdout, stderr = await asyncio.wait_for(
process.communicate(),
timeout=timeout
)
stdout_text = stdout.decode('utf-8', errors='replace')
stderr_text = stderr.decode('utf-8', errors='replace')
logger.info(f"Command returned with code {process.returncode}")
if process.returncode != 0:
logger.warning(f"Command failed with stderr: {stderr_text[:500]}...")
return {
"returncode": process.returncode,
"stdout": stdout_text,
"stderr": stderr_text
}
except asyncio.TimeoutError:
# Try to terminate the process
logger.error(f"Command timed out after {timeout} seconds: {' '.join(cmd)}")
process.terminate()
try:
await asyncio.wait_for(process.wait(), timeout=5)
except asyncio.TimeoutError:
# If it doesn't terminate, force kill
process.kill()
raise TimeoutError(f"Command timed out after {timeout} seconds: {' '.join(cmd)}")
# Database operations
def store_job(job_id: str, job_data: Dict[str, Any]):
"""Store job data in SQLite database"""
current_time = time.time()
# Extract fields from job_data
status = job_data.get("status", "unknown")
created_at = job_data.get("created_at", current_time)
work_dir = job_data.get("work_dir", "")
api_key = job_data.get("api_key", "")
options = json.dumps(job_data.get("options", {}))
error = job_data.get("error", "")
progress = job_data.get("progress", "")
with sqlite3.connect(DB_PATH) as conn:
conn.execute(
'''
INSERT OR REPLACE INTO jobs
(id, status, created_at, work_dir, api_key, options, error, progress, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
''',
(job_id, status, created_at, work_dir, api_key, options, error, progress, current_time)
)
conn.commit()
def get_job(job_id: str) -> Optional[Dict[str, Any]]:
"""Retrieve job data from SQLite database"""
with sqlite3.connect(DB_PATH) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute('SELECT * FROM jobs WHERE id = ?', (job_id,))
row = cursor.fetchone()
if row:
job_data = dict(row)
# Parse options back to dict
if job_data.get('options'):
job_data['options'] = json.loads(job_data['options'])
return job_data
return None
def update_job(job_id: str, updates: Dict[str, Any]):
"""Update specific fields in the job data"""
current_time = time.time()
# Start with SET updated_at=?
set_values = ["updated_at=?"]
params = [current_time]
# Add each update field
for key, value in updates.items():
if key == 'options':
value = json.dumps(value)
set_values.append(f"{key}=?")
params.append(value)
# Add job_id as the last parameter
params.append(job_id)
with sqlite3.connect(DB_PATH) as conn:
query = f"UPDATE jobs SET {', '.join(set_values)} WHERE id = ?"
conn.execute(query, params)
conn.commit()
def get_pdf_path(job_id: str) -> str:
"""Get the path where the PDF should be stored"""
return os.path.join(JOBS_DIR, f"{job_id}.pdf")
def store_pdf(job_id: str, pdf_content: bytes):
"""Store PDF in the filesystem"""
pdf_path = get_pdf_path(job_id)
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
with open(pdf_path, 'wb') as f:
f.write(pdf_content)
def get_pdf(job_id: str) -> Optional[bytes]:
"""Retrieve PDF from the filesystem"""
pdf_path = get_pdf_path(job_id)
if os.path.exists(pdf_path):
with open(pdf_path, 'rb') as f:
return f.read()
return None
async def compile_latex(
job_id: str,
work_dir: str,
main_file: str,
num_runs: int,
use_bibtex: bool
):
"""Compile LaTeX document with proper error handling and multiple runs if needed."""
results = []
main_tex_path = os.path.join(work_dir, main_file)
# Verify the main file exists
if not os.path.exists(main_tex_path):
logger.error(f"Main LaTeX file not found: {main_tex_path}")
update_job(job_id, {
"status": "failed",
"error": f"Main LaTeX file ({main_file}) not found in the archive."
})
return False
# List directory contents for debugging
logger.info(f"Work directory contents: {os.listdir(work_dir)}")
try:
with working_directory(work_dir):
# Run pdflatex multiple times as needed
for i in range(num_runs):
update_job(job_id, {
"status": "processing",
"progress": f"LaTeX compilation {i+1}/{num_runs}"
})
# For verbose output to diagnose issues
cmd = [
'pdflatex',
'-interaction=nonstopmode',
'-file-line-error',
main_file
]
try:
result = await run_latex_command(cmd)
results.append(result)
# If compilation failed, stop and provide details
if result["returncode"] != 0:
# Extract relevant error messages
error_lines = []
for line in result["stdout"].split('\n'):
if ":" in line and ("Error" in line or "Fatal" in line):
error_lines.append(line)
error_message = "LaTeX compilation failed"
if error_lines:
error_message = f"LaTeX errors: {' | '.join(error_lines[:3])}"
update_job(job_id, {
"status": "failed",
"error": error_message,
"details": json.dumps(result)
})
return False
# Run bibtex if requested (after the first pdflatex run)
if use_bibtex and i == 0:
update_job(job_id, {
"status": "processing",
"progress": "Running BibTeX"
})
basename = os.path.splitext(main_file)[0]
bibtex_cmd = ['bibtex', basename]
bibtex_result = await run_latex_command(bibtex_cmd)
results.append(bibtex_result)
except TimeoutError as e:
logger.error(f"Timeout during compilation: {str(e)}")
update_job(job_id, {
"status": "failed",
"error": str(e)
})
return False
except Exception as e:
logger.error(f"Unexpected error during compilation: {str(e)}", exc_info=True)
update_job(job_id, {
"status": "failed",
"error": f"Unexpected error: {str(e)}"
})
return False
# Check if the PDF was generated
pdf_basename = os.path.splitext(main_file)[0]
pdf_path = os.path.join(work_dir, f"{pdf_basename}.pdf")
if not os.path.exists(pdf_path):
logger.error(f"PDF not generated at expected path: {pdf_path}")
update_job(job_id, {
"status": "failed",
"error": "PDF file not generated despite successful compilation"
})
return False
# Store the PDF in the filesystem
with open(pdf_path, 'rb') as f:
pdf_content = f.read()
store_pdf(job_id, pdf_content)
# Update job status
update_job(job_id, {
"status": "completed",
})
return True
except Exception as e:
logger.error(f"Exception in compile_latex: {str(e)}", exc_info=True)
update_job(job_id, {
"status": "failed",
"error": f"Unexpected error: {str(e)}"
})
return False
# Clean up old jobs (runs in background)
async def cleanup_old_jobs():
"""Clean up old jobs and their resources"""
while True:
try:
current_time = time.time()
expiry_time = current_time - JOB_EXPIRY
# Get expired jobs
with sqlite3.connect(DB_PATH) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute('SELECT id, work_dir FROM jobs WHERE created_at < ?', (expiry_time,))
expired_jobs = cursor.fetchall()
for job in expired_jobs:
job_id = job['id']
work_dir = job['work_dir']
# Clean up PDF if it exists
pdf_path = get_pdf_path(job_id)
if os.path.exists(pdf_path):
os.remove(pdf_path)
# Clean up work directory if it exists
if work_dir and os.path.exists(work_dir):
shutil.rmtree(work_dir, ignore_errors=True)
# Remove job from database
with sqlite3.connect(DB_PATH) as conn:
conn.execute('DELETE FROM jobs WHERE id = ?', (job_id,))
conn.commit()
logger.info(f"Cleaned up expired job {job_id}")
except Exception as e:
logger.error(f"Error in cleanup task: {str(e)}", exc_info=True)
# Run cleanup every 15 minutes
await asyncio.sleep(900)
@app.post("/tex2pdf",
dependencies=[Depends(check_rate_limit)],
summary="Convert LaTeX files to PDF",
response_description="Returns job ID for status checking")
async def convert_to_pdf(
background_tasks: BackgroundTasks,
request: Request,
zip_file: UploadFile = File(...),
options: Optional[ConversionOptions] = None
):
"""
Takes a zip file containing LaTeX files and compiles them into a PDF.
- The zip file must contain all necessary files for compilation
- By default, assumes main.tex is the main file unless specified otherwise
- Returns a job ID that can be used to check status and retrieve the PDF
"""
api_key = verify_api_key(request)
start_time = time.time()
job_id = str(uuid.uuid4())
if options is None:
options = ConversionOptions()
logger.info(f"Starting conversion job {job_id}")
# Validate input
if not zip_file.filename.endswith('.zip'):
logger.warning(f"Job {job_id}: Invalid file format: {zip_file.filename}")
raise HTTPException(
status_code=400,
detail="Uploaded file must be a zip archive."
)
if not validate_latex_filename(options.main_file):
logger.warning(f"Job {job_id}: Invalid main file name: {options.main_file}")
raise HTTPException(
status_code=400,
detail="Main file name must be a valid LaTeX filename (e.g., main.tex)"
)
# Create the job record
job_data = {
"id": job_id,
"status": "uploading",
"created_at": start_time,
"options": options.dict(),
"api_key": api_key,
}
store_job(job_id, job_data)
try:
# Create a temporary directory for this job
work_dir = tempfile.mkdtemp(prefix=f"tex2pdf_{job_id}_")
update_job(job_id, {
"status": "extracting",
"work_dir": work_dir
})
# Read zip file to memory
zip_content = await zip_file.read()
if len(zip_content) > MAX_UPLOAD_SIZE:
logger.warning(f"Job {job_id}: File too large: {len(zip_content)} bytes")
update_job(job_id, {
"status": "failed",
"error": f"File too large. Maximum size: {MAX_UPLOAD_SIZE/1024/1024} MB"
})
return {
"job_id": job_id,
"status": "failed",
"message": "File too large"
}
# Extract zip files safely
try:
sanitize_zip_archive(BytesIO(zip_content), work_dir)
update_job(job_id, {"status": "queued"})
except ValueError as e:
logger.warning(f"Job {job_id}: Zip extraction failed: {str(e)}")
update_job(job_id, {
"status": "failed",
"error": f"Zip extraction failed: {str(e)}"
})
return {
"job_id": job_id,
"status": "failed",
"message": str(e)
}
# Start compilation in background
background_tasks.add_task(
compile_latex,
job_id,
work_dir,
options.main_file,
options.num_runs,
options.use_bibtex
)
return {
"job_id": job_id,
"status": "processing",
"message": "Conversion job started"
}
except Exception as e:
logger.error(f"Job {job_id}: Unexpected error: {str(e)}", exc_info=True)
update_job(job_id, {
"status": "failed",
"error": f"Unexpected error: {str(e)}"
})
return {
"job_id": job_id,
"status": "failed",
"message": "Server error"
}
@app.get("/tex2pdf/status/{job_id}",
dependencies=[Depends(verify_api_key)],
summary="Check the status of a conversion job")
async def check_job_status(job_id: str):
"""Check the status of a previously submitted conversion job."""
job = get_job(job_id)
if not job:
raise HTTPException(
status_code=404,
detail="Job not found"
)
# Clean sensitive or internal information
response = {
"job_id": job_id,
"status": job["status"],
"created_at": job["created_at"],
}
# Add error details if failed
if job["status"] == "failed" and "error" in job:
response["error"] = job["error"]
# Add progress info if processing
if job["status"] == "processing" and "progress" in job:
response["progress"] = job["progress"]
return response
@app.get("/tex2pdf/download/{job_id}",
dependencies=[Depends(verify_api_key)],
summary="Download the generated PDF")
async def download_pdf(job_id: str):
"""Download the PDF generated by a completed conversion job."""
job = get_job(job_id)
if not job:
raise HTTPException(
status_code=404,
detail="Job not found"
)
if job["status"] != "completed":
raise HTTPException(
status_code=400,
detail=f"PDF not ready. Current status: {job['status']}"
)
try:
# Option 1: Get PDF from memory and stream it
# pdf_content = get_pdf(job_id)
# if not pdf_content:
# raise HTTPException(
# status_code=404,
# detail="PDF file not found in storage"
# )
#
# # Generate a filename based on the job ID
# filename = f"document_{job_id[-6:]}.pdf"
#
# return StreamingResponse(
# BytesIO(pdf_content),
# media_type='application/pdf',
# headers={"Content-Disposition": f"attachment; filename={filename}"}
# )
# Option 2: Use FileResponse for more efficient file serving
pdf_path = get_pdf_path(job_id)
if not os.path.exists(pdf_path):
raise HTTPException(
status_code=404,
detail="PDF file not found in storage"
)
filename = f"document_{job_id[-6:]}.pdf"
return FileResponse(
pdf_path,
media_type='application/pdf',
filename=filename
)
except Exception as e:
logger.error(f"Error delivering PDF for job {job_id}: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail="Error retrieving PDF file"
)
@app.get("/health", summary="Health check endpoint")
async def health_check():
"""Simple health check endpoint to verify the API is running."""
try:
# Check database connection
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.execute("SELECT 1")
cursor.fetchone()
db_status = "connected"
except Exception as e:
db_status = f"error: {str(e)}"
return {
"status": "healthy",
"version": VERSION,
"database": db_status,
"storage": os.path.exists(JOBS_DIR) and os.access(JOBS_DIR, os.W_OK)
}
@app.on_event("startup")
async def startup_event():
logger.info("Service starting up")
# Initialize the database
init_db()
# Start background cleanup task
asyncio.create_task(cleanup_old_jobs())
@app.on_event("shutdown")
async def shutdown_event():
"""Clean up on shutdown"""
logger.info("Service shutting down")
executor.shutdown(wait=False)

requirements.txt Normal file

@@ -0,0 +1,5 @@
fastapi==0.110.0
uvicorn==0.27.1
pydantic==2.5.3
python-multipart==0.0.6
aiofiles==23.2.1