Version 2.3.2 (Complete refactor)
parent dcb57e5a2b
commit 9963a25e4a
.dockerignore (new file, 10 lines)
@@ -0,0 +1,10 @@
+.gitignore
+.dockerignore
+.git
+.archive
+.env
+.env.*
+logs
+Dockerfile
+README.md
+docker-compose.yaml
Dockerfile (80 lines)
@@ -1,14 +1,72 @@
-FROM archlinux:base
-
-# Update and install necessary packages
-RUN pacman -Sy --noconfirm curl reflector && \
-    reflector --latest 5 --sort rate --save /etc/pacman.d/mirrorlist && \
-    sed -i '/\[options\]/a XferCommand = /usr/bin/curl -C - --fail --retry 3 --retry-delay 3 -o %o %u' /etc/pacman.conf && \
-    pacman -Syu --noconfirm --needed texlive-basic texlive-bibtexextra texlive-bin texlive-binextra texlive-context texlive-fontsrecommended texlive-fontsextra texlive-fontutils texlive-formatsextra texlive-langenglish texlive-langeuropean texlive-langfrench texlive-langgerman texlive-latex texlive-latexextra texlive-latexrecommended texlive-luatex texlive-mathscience texlive-metapost texlive-music texlive-pictures texlive-plaingeneric texlive-pstricks texlive-publishers && \
-    pacman -Syu --noconfirm --needed python-fastapi uvicorn python-python-multipart && \
-    yes | pacman -Scc
-
-# Set working directory
-WORKDIR /app
-COPY main.py .
-
-ENTRYPOINT ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
+FROM python:3.11-slim AS builder
+
+# Set working directory
+WORKDIR /app
+
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip wheel --no-cache-dir --wheel-dir /app/wheels -r requirements.txt
+
+
+# Create final image
+FROM python:3.11-slim AS final
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app \
+    PORT=8000 \
+    MAX_WORKERS=4 \
+    TZ=UTC \
+    JOBS_DIR=/data/jobs \
+    DB_PATH=/data/db/jobs.db
+
+# Create a non-root user
+RUN groupadd -r appuser && useradd -r -g appuser appuser
+
+# Install LaTeX and required dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    texlive-full \
+    tini \
+    sqlite3 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Create persistent directories
+RUN mkdir -p /data/jobs /data/db /app/temp && \
+    chown -R appuser:appuser /app /data
+
+# Copy Python wheels from builder stage
+COPY --from=builder /app/wheels /wheels
+
+# Install Python dependencies
+RUN pip install --no-cache-dir /wheels/* && rm -rf /wheels
+
+# Copy application code
+COPY . /app/
+
+# Create entrypoint script that properly handles environment variables
+RUN echo '#!/bin/sh\n\
+exec uvicorn main:app --host 0.0.0.0 --port $PORT --workers $MAX_WORKERS --log-level info\n\
+' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh
+
+# Switch to non-root user
+USER appuser
+
+# Expose the service port
+EXPOSE 8000
+
+# Use tini as init
+ENTRYPOINT ["/usr/bin/tini", "--"]
+
+# Run the application with proper signal handling
+CMD ["/app/entrypoint.sh"]
README.md (219 lines)
@@ -1,58 +1,213 @@
-# LaTeX-to-PDF Conversion Service
+# LaTeX to PDF Conversion Service

-This service provides an API endpoint to convert LaTeX documents into PDF format. It supports `.zip` file uploads containing the LaTeX source file (`main.tex`) and any associated files (e.g., images or additional `.tex` files).
+A high-performance, secure REST API for converting LaTeX documents to PDF format.

-## Getting Started
-
-### Building the Docker Image
-
-To build the Docker image for the conversion service, navigate to the project directory and run:
-
-```bash
-docker build -t rbehzadan/tex2pdf .
-```
-
-### Running the Service
-
-After building the image, you can start the service with the following command:
-
-```bash
-docker run -d -p 8000:8000 rbehzadan/tex2pdf
-```
-
-This command runs the Docker container in the background (`-d`) and maps port `8000` of the container to port `8000` on the host, making the service accessible at `http://localhost:8000`.
+## Features
+
+- **Simple API**: Upload a ZIP file containing LaTeX documents and get a PDF back
+- **Secure Processing**: Comprehensive security measures including input validation and sanitization
+- **Multiple Workers**: Designed for concurrency with shared file system and SQLite database
+- **Robust Error Handling**: Detailed error messages with LaTeX compilation logs
+- **Automatic Cleanup**: Background process removes expired PDFs and temporary files
+- **Configurable Options**: Multiple compilation runs, BibTeX support, custom main file name
+- **API Key Authentication**: Optional security layer with configurable API keys
+- **Rate Limiting**: Protection against API abuse
+- **Resource Control**: Limits on file sizes and compilation time
+- **Docker Ready**: Ready-to-use Docker and Docker Compose configurations
+
+## Quick Start
+
+The easiest way to run the service is with Docker Compose:
+
+```bash
+# Clone the repository
+git clone https://github.com/yourusername/tex2pdf.git
+cd tex2pdf
+
+# Start the service
+docker-compose up -d
+```
+
+The service will be available at `http://localhost:8000`.

-## API Endpoint
-
-The service exposes a single POST endpoint at `/tex2pdf` for converting LaTeX to PDF.
-
-### Uploading a `.zip` File
-
-The `.zip` file should contain a `main.tex` file and can include additional resources such as images or other `.tex` files used by `main.tex`.
+## API Usage
+
+### Convert LaTeX to PDF
+
+```bash
+curl -X POST \
+  -H "X-API-Key: 1234" \
+  -F "zip_file=@my_latex_files.zip" \
+  http://localhost:8000/tex2pdf
+```
+
+Response:
+```json
+{
+  "job_id": "28f5bf9b-587f-4f3c-a3de-4d737d9736ce",
+  "status": "processing",
+  "message": "Conversion job started"
+}
+```

-## Manual Testing
-
-### Testing with `curl`
-
-To test the conversion service with `curl`, use the following command:
-
-```bash
-curl -X POST -F "zip_file=@path/to/your/file.zip" http://localhost:8000/tex2pdf -o output.pdf
-```
-
-Replace `path/to/your/file.zip` with the actual path to your `.zip` file. The resulting PDF will be saved as `output.pdf` in the current directory.
+### Check Job Status
+
+```bash
+curl -X GET \
+  -H "X-API-Key: 1234" \
+  http://localhost:8000/tex2pdf/status/28f5bf9b-587f-4f3c-a3de-4d737d9736ce
+```
+
+Response:
+```json
+{
+  "job_id": "28f5bf9b-587f-4f3c-a3de-4d737d9736ce",
+  "status": "completed",
+  "created_at": 1741424390.6039968
+}
+```

-### Testing with HTTPie
-
-HTTPie offers a more user-friendly way to test the service. Use the following command for testing:
-
-```bash
-http -f POST http://localhost:8000/tex2pdf zip_file@path/to/your/file.zip > output.pdf
-```
-
-As with `curl`, replace `path/to/your/file.zip` with the path to your `.zip` file. The output will be redirected to `output.pdf` in the current directory.
+### Download PDF
+
+```bash
+curl -X GET \
+  -H "X-API-Key: 1234" \
+  -o output.pdf \
+  http://localhost:8000/tex2pdf/download/28f5bf9b-587f-4f3c-a3de-4d737d9736ce
+```

-## Troubleshooting
-
-If you encounter any issues with the conversion process, ensure that your `.zip` file is structured correctly, with a `main.tex` file at the root. For more detailed error information, consult the service logs.
+### Health Check
+
+```bash
+curl http://localhost:8000/health
+```
+
+Response:
+```json
+{
+  "status": "healthy",
+  "version": "1.0.0",
+  "database": "connected",
+  "storage": true
+}
+```
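Taken together, these endpoints form a submit, poll, download workflow. The sketch below is illustrative only (not from this commit); it assumes Python with the third-party `requests` package and the sample API key `1234` used in the examples above:

```python
import time

import requests  # third-party: pip install requests

BASE_URL = "http://localhost:8000"
HEADERS = {"X-API-Key": "1234"}  # sample key from docker-compose.yaml

# 1. Submit the ZIP archive for conversion
with open("my_latex_files.zip", "rb") as f:
    resp = requests.post(f"{BASE_URL}/tex2pdf", headers=HEADERS,
                         files={"zip_file": ("my_latex_files.zip", f)})
resp.raise_for_status()
job_id = resp.json()["job_id"]

# 2. Poll until the job settles in a terminal state
while True:
    status = requests.get(f"{BASE_URL}/tex2pdf/status/{job_id}",
                          headers=HEADERS).json()["status"]
    if status in ("completed", "failed"):
        break
    time.sleep(2)

# 3. Fetch the PDF once compilation has completed
if status == "completed":
    pdf = requests.get(f"{BASE_URL}/tex2pdf/download/{job_id}", headers=HEADERS)
    pdf.raise_for_status()
    with open("output.pdf", "wb") as f:
        f.write(pdf.content)
```

Intermediate statuses reported by the service include `uploading`, `extracting`, `queued`, and `processing`.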
+
+## Advanced Usage
+
+### Compilation Options
+
+You can customize the LaTeX compilation process:
+
+```bash
+curl -X POST \
+  -H "X-API-Key: 1234" \
+  -F "zip_file=@my_latex_files.zip" \
+  -F "options={\"main_file\": \"document.tex\", \"num_runs\": 3, \"use_bibtex\": true}" \
+  http://localhost:8000/tex2pdf
+```
+
+Options:
+- `main_file`: Main LaTeX file to compile (default: `main.tex`)
+- `num_runs`: Number of compilation runs (default: 2)
+- `use_bibtex`: Run BibTeX for bibliography processing (default: false)
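The escaped JSON in the `options` form field is easier to build programmatically. A sketch of the same request from Python, mirroring the curl call above (again assuming the third-party `requests` package):

```python
import json

import requests  # third-party: pip install requests

options = {"main_file": "document.tex", "num_runs": 3, "use_bibtex": True}

with open("my_latex_files.zip", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/tex2pdf",
        headers={"X-API-Key": "1234"},
        files={"zip_file": ("my_latex_files.zip", f)},
        # json.dumps produces exactly the quoted string the curl example escapes by hand
        data={"options": json.dumps(options)},
    )
print(resp.json())
```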
+
+## ZIP File Requirements
+
+- The ZIP file must contain all necessary files for LaTeX compilation
+- By default, the service looks for `main.tex` as the main file
+- All referenced files (images, styles, etc.) should be included
+- Paths in LaTeX files should be relative and match the ZIP structure
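For example, a packaging sketch using only the Python standard library; `project/` is a hypothetical directory holding `main.tex` and its resources:

```python
import zipfile
from pathlib import Path

src = Path("project")  # hypothetical project directory containing main.tex

with zipfile.ZipFile("my_latex_files.zip", "w", zipfile.ZIP_DEFLATED) as zf:
    for path in src.rglob("*"):
        if path.is_file():
            # Archive names relative to the project root keep paths relative,
            # so main.tex lands at the top level of the ZIP
            zf.write(path, arcname=path.relative_to(src))
```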
+
+## Configuration
+
+The service can be configured via environment variables in the `docker-compose.yaml` file:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `ALLOWED_API_KEYS` | Comma-separated list of valid API keys | "" (empty = no auth) |
+| `API_KEY_REQUIRED` | Enable/disable API key validation | "true" |
+| `MAX_WORKERS` | Number of uvicorn workers | 2 |
+| `MAX_UPLOAD_SIZE` | Maximum file upload size in bytes | 52428800 (50MB) |
+| `MAX_COMPILATION_TIME` | Maximum LaTeX compilation time in seconds | 240 |
+| `RATE_LIMIT_WINDOW` | Rate limiting window in seconds | 60 |
+| `MAX_REQUESTS_PER_WINDOW` | Maximum requests per rate limit window | 10 |
+| `JOB_EXPIRY` | Job expiry time in seconds | 3600 (1 hour) |
+| `JOBS_DIR` | Directory for storing PDF files | "/data/jobs" |
+| `DB_PATH` | Path to SQLite database | "/data/db/jobs.db" |
+
+## Deployment
+
+### System Requirements
+
+- Docker and Docker Compose
+- For running without Docker:
+  - Python 3.10+
+  - LaTeX distribution (texlive)
+  - SQLite3
+
+### Production Deployment Considerations
+
+For production deployments, consider:
+
+1. **Configure a reverse proxy** (like Nginx) with HTTPS
+2. **Adjust resource limits** based on your workload
+3. **Set strong API keys** and restrict access
+4. **Mount persistent volumes** for job data
+5. **Monitor disk usage** and adjust `JOB_EXPIRY` accordingly
+6. **Set up logging** to a centralized logging service
+
+## Architecture
+
+The service uses a stateless design with background processing:
+
+1. **FastAPI Application**: Handles HTTP requests and responses
+2. **SQLite Database**: Stores job metadata and status
+3. **File System**: Stores generated PDFs and temporary files
+4. **Background Tasks**: Process LaTeX compilation asynchronously
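The job lifecycle that ties these components together can be read off the status values recorded in `main.py` (a descriptive sketch, not code from this commit):

```python
# Statuses a job passes through, as written to the SQLite jobs table:
PIPELINE = ["uploading", "extracting", "queued", "processing", "completed"]
# Any stage may instead end in "failed". While "processing", the job also
# carries a progress string such as "LaTeX compilation 1/2" or "Running BibTeX".
```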
+
+## Development
+
+### Local Development Setup
+
+```bash
+# Clone the repository
+git clone https://github.com/yourusername/tex2pdf.git
+cd tex2pdf
+
+# Create a virtual environment
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Run the service
+uvicorn main:app --reload --host 0.0.0.0 --port 8000
+```
+
+### Running Tests
+
+```bash
+pytest tests/
+```
+
+## License
+
+[MIT License](LICENSE)
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## Security Considerations
+
+This service implements several security measures:
+
+- API key authentication
+- Input validation
+- Rate limiting
+- Safe ZIP extraction
+- Process isolation
+
+Even so, allowing users to run LaTeX compilation on your server carries inherent risks. Always deploy behind a secure gateway in production environments.
docker-compose.yaml (new file, 21 lines)
@@ -0,0 +1,21 @@
+services:
+  app:
+    image: rbehzadan/tex2pdf
+    container_name: tex2pdf
+    ports:
+      - "8000:8000"
+    environment:
+      - ALLOWED_API_KEYS=1234,5678,abcd # Comma-separated list of allowed API keys
+      - API_KEY_REQUIRED=true # Set to "false" to disable API key validation
+      - MAX_WORKERS=4 # Number of uvicorn workers
+      - MAX_UPLOAD_SIZE=52428800 # 50MB in bytes
+      - MAX_COMPILATION_TIME=240 # Maximum LaTeX compilation time in seconds
+      - RATE_LIMIT_WINDOW=60 # Rate limiting window in seconds
+      - MAX_REQUESTS_PER_WINDOW=10 # Maximum requests per rate limit window
+      - JOB_EXPIRY=3600 # Job expiry time in seconds (1 hour)
+    volumes:
+      - pdf_data:/data
+    restart: unless-stopped
+
+volumes:
+  pdf_data:
main.py (730 lines)
@@ -1,60 +1,706 @@
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.responses import StreamingResponse
+from fastapi import FastAPI, File, UploadFile, HTTPException, Depends, BackgroundTasks, Request
+from fastapi.responses import StreamingResponse, FileResponse
 from io import BytesIO
 import asyncio
 import tempfile
 import zipfile
 import os
+import logging
+import shutil
+import re
+import uuid
+import json
+import time
+from typing import Optional, Dict, List, Any
+from pathlib import Path
+import contextlib
+from pydantic import BaseModel, Field
+import sqlite3
+from concurrent.futures import ThreadPoolExecutor

-app = FastAPI()
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger("tex2pdf-service")

-@app.post("/tex2pdf")
-async def convert_to_pdf(zip_file: UploadFile = File(...)):
-    if zip_file.filename.endswith('.zip'):
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            # Unpack the zip file
-            with zipfile.ZipFile(BytesIO(await zip_file.read())) as z:
-                z.extractall(tmpdirname)
-
-            # Change working directory to tmpdirname
-            os.chdir(tmpdirname)
+app = FastAPI(title="LaTeX to PDF Conversion Service")
+
+# Configuration
+MAX_UPLOAD_SIZE = int(os.environ.get("MAX_UPLOAD_SIZE", 50 * 1024 * 1024))  # Default: 50 MB
+API_KEY_NAME = os.environ.get("API_KEY_NAME", "X-API-Key")
+# Filter out empty entries so an unset variable yields no keys
+ALLOWED_API_KEYS = [k for k in os.environ.get("ALLOWED_API_KEYS", "").split(",") if k]
+MAX_COMPILATION_TIME = int(os.environ.get("MAX_COMPILATION_TIME", 240))  # Default: 240 seconds
+RATE_LIMIT_WINDOW = int(os.environ.get("RATE_LIMIT_WINDOW", 60))  # Default: 60 seconds
+MAX_REQUESTS_PER_WINDOW = int(os.environ.get("MAX_REQUESTS_PER_WINDOW", 10))  # Default: 10 requests
+JOB_EXPIRY = int(os.environ.get("JOB_EXPIRY", 3600))  # Default: 1 hour
+JOBS_DIR = os.environ.get("JOBS_DIR", "/app/jobs")
+DB_PATH = os.environ.get("DB_PATH", "/app/db/jobs.db")
+API_KEY_REQUIRED = len(ALLOWED_API_KEYS) > 0
+if API_KEY_REQUIRED:
+    API_KEY_REQUIRED = os.environ.get("API_KEY_REQUIRED", "true").lower() in ("true", "1", "yes")
+VERSION = open("VERSION").read().strip()

-            # Find the main LaTeX file (assuming a convention, e.g., main.tex)
-            main_tex_file = 'main.tex'
-            main_tex_path = os.path.join(tmpdirname, main_tex_file)
-            if not os.path.exists(main_tex_path):
-                raise HTTPException(status_code=400, detail="Main LaTeX file (main.tex) not found in the zip.")
+# Create necessary directories
+os.makedirs(JOBS_DIR, exist_ok=True)
+os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

-            # Compile the LaTeX document
-            cmd = ['pdflatex', '-interaction=nonstopmode', '-output-directory', tmpdirname, main_tex_path]
-            process = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
+
+# Initialize SQLite database
+def init_db():
+    with sqlite3.connect(DB_PATH) as conn:
+        conn.execute('''
+            CREATE TABLE IF NOT EXISTS jobs (
+                id TEXT PRIMARY KEY,
+                status TEXT NOT NULL,
+                created_at REAL NOT NULL,
+                work_dir TEXT,
+                api_key TEXT,
+                options TEXT,
+                error TEXT,
+                progress TEXT,
+                details TEXT, -- raw compiler output for failed jobs (written by update_job)
+                updated_at REAL NOT NULL
+            )
+        ''')
+        # Add indexes for faster lookups
+        conn.execute('CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status)')
+        conn.execute('CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at)')
+
+# Thread pool for database operations
+executor = ThreadPoolExecutor(max_workers=4)
+
+# In-memory rate limiting
+rate_limits: Dict[str, List[float]] = {}
+
+class ConversionOptions(BaseModel):
+    main_file: str = Field(default="main.tex", description="Main LaTeX file to compile")
+    num_runs: int = Field(default=2, ge=1, le=5, description="Number of compilation runs")
+    use_bibtex: bool = Field(default=False, description="Run BibTeX for bibliography")
+
+def verify_api_key(request: Request):
+    # If API keys are not required, skip validation
+    if not API_KEY_REQUIRED:
+        return "no_auth"
+
+    api_key = request.headers.get(API_KEY_NAME)
+
+    # Check if API key is provided and valid
+    if not api_key:
+        logger.warning("Missing API key in request")
+        raise HTTPException(
+            status_code=401,
+            detail="API key required",
+        )
+
+    if not ALLOWED_API_KEYS or api_key not in ALLOWED_API_KEYS:
+        logger.warning(f"Unauthorized access attempt with API key: {api_key[:5]}...")
+        raise HTTPException(
+            status_code=401,
+            detail="Invalid API key",
+        )
+
+    return api_key
+
+def check_rate_limit(request: Request, api_key: str = Depends(verify_api_key)):
+    client_id = api_key or request.client.host
+    current_time = time.time()
+
+    if client_id not in rate_limits:
+        rate_limits[client_id] = []
+
+    # Remove timestamps outside the window
+    rate_limits[client_id] = [t for t in rate_limits[client_id] if current_time - t < RATE_LIMIT_WINDOW]
+
+    if len(rate_limits[client_id]) >= MAX_REQUESTS_PER_WINDOW:
+        logger.warning(f"Rate limit exceeded for {client_id[:5]}...")
+        raise HTTPException(
+            status_code=429,
+            detail=f"Rate limit exceeded. Maximum {MAX_REQUESTS_PER_WINDOW} requests per {RATE_LIMIT_WINDOW} seconds.",
+        )
+
+    rate_limits[client_id].append(current_time)
+    return client_id
+
+def validate_latex_filename(filename: str) -> bool:
+    """Validate if the filename follows safe LaTeX filename conventions."""
+    return bool(re.match(r'^[a-zA-Z0-9_\-\.]+\.tex$', filename))
+
+def sanitize_zip_archive(zip_file_obj, extract_path):
+    """Extracts zip contents safely, preventing directory traversal attacks."""
-            try:
-                print(f"Running pdflatex on {main_tex_path}")
-                stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=120)
-                # print(f"pdflatex output: {stdout.decode()}")
-                # print(f"pdflatex errors: {stderr.decode()}")
-            except asyncio.TimeoutError:
-                return {"error": "Conversion timed out."}
+    try:
+        with zipfile.ZipFile(zip_file_obj) as zip_ref:
+            # Log zip contents for debugging
+            logger.info(f"ZIP contents: {zip_ref.namelist()}")
+
+            # First, check for suspicious paths
+            for file_info in zip_ref.infolist():
+                # Convert to Path for safer path handling
+                file_path = Path(file_info.filename)
+
+                # Check for absolute paths or directory traversal attempts
+                if file_path.is_absolute() or '..' in file_path.parts:
+                    raise ValueError(f"Suspicious path detected: {file_info.filename}")
+
+                # Check for extremely large files
+                if file_info.file_size > MAX_UPLOAD_SIZE:
+                    raise ValueError(f"File too large: {file_info.filename}")
+
+            # If all files pass validation, extract them
+            for file_info in zip_ref.infolist():
+                # Skip directories
+                if file_info.filename.endswith('/'):
+                    continue
+
+                # Create a safe extraction path
+                target_path = Path(extract_path) / file_info.filename
+
+                # Create parent directories if they don't exist
+                target_path.parent.mkdir(parents=True, exist_ok=True)
+
+                # Extract the file
+                with zip_ref.open(file_info) as source, open(target_path, 'wb') as target:
+                    shutil.copyfileobj(source, target)
+
+            # List extracted files for debugging
+            extracted_files = list(Path(extract_path).glob('**/*'))
+            logger.info(f"Extracted files: {[str(f.relative_to(extract_path)) for f in extracted_files]}")
+
+            return True
+    except zipfile.BadZipFile:
+        raise ValueError("Invalid ZIP file format")
+    except Exception as e:
+        logger.error(f"Error during ZIP extraction: {str(e)}", exc_info=True)
+        raise ValueError(f"Error extracting ZIP: {str(e)}")
+
+@contextlib.contextmanager
+def working_directory(path):
+    """Changes working directory within the context and reverts back afterwards."""
+    origin = os.getcwd()
+    try:
+        os.chdir(path)
+        yield
+    finally:
+        os.chdir(origin)
+
+async def run_latex_command(cmd, timeout=MAX_COMPILATION_TIME):
+    """Run a LaTeX-related command with proper timeout and error handling."""
+    logger.info(f"Running command: {' '.join(cmd)}")
+
+    process = await asyncio.create_subprocess_exec(
+        *cmd,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE
+    )
+
+    try:
+        stdout, stderr = await asyncio.wait_for(
+            process.communicate(),
+            timeout=timeout
+        )
+
+        stdout_text = stdout.decode('utf-8', errors='replace')
+        stderr_text = stderr.decode('utf-8', errors='replace')
+
+        logger.info(f"Command returned with code {process.returncode}")
+        if process.returncode != 0:
+            logger.warning(f"Command failed with stderr: {stderr_text[:500]}...")
-            if process.returncode != 0:
-                # Compilation failed
-                return {
-                    "error": "Conversion failed.",
-                    "details": {
-                        "stderr": stderr.decode(),
-                        "stdout": stdout.decode(),
-                    },
-                }
+
+        return {
+            "returncode": process.returncode,
+            "stdout": stdout_text,
+            "stderr": stderr_text
+        }
+    except asyncio.TimeoutError:
+        # Try to terminate the process
+        logger.error(f"Command timed out after {timeout} seconds: {' '.join(cmd)}")
+        process.terminate()
+        try:
+            await asyncio.wait_for(process.wait(), timeout=5)
+        except asyncio.TimeoutError:
+            # If it doesn't terminate, force kill
+            process.kill()
+
+        raise TimeoutError(f"Command timed out after {timeout} seconds: {' '.join(cmd)}")
+
+# Database operations
+def store_job(job_id: str, job_data: Dict[str, Any]):
+    """Store job data in SQLite database"""
+    current_time = time.time()
+
+    # Extract fields from job_data
+    status = job_data.get("status", "unknown")
+    created_at = job_data.get("created_at", current_time)
+    work_dir = job_data.get("work_dir", "")
+    api_key = job_data.get("api_key", "")
+    options = json.dumps(job_data.get("options", {}))
+    error = job_data.get("error", "")
+    progress = job_data.get("progress", "")
+
+    with sqlite3.connect(DB_PATH) as conn:
+        conn.execute(
+            '''
+            INSERT OR REPLACE INTO jobs
+            (id, status, created_at, work_dir, api_key, options, error, progress, updated_at)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+            ''',
+            (job_id, status, created_at, work_dir, api_key, options, error, progress, current_time)
+        )
+        conn.commit()
+
+def get_job(job_id: str) -> Optional[Dict[str, Any]]:
+    """Retrieve job data from SQLite database"""
+    with sqlite3.connect(DB_PATH) as conn:
+        conn.row_factory = sqlite3.Row
+        cursor = conn.execute('SELECT * FROM jobs WHERE id = ?', (job_id,))
+        row = cursor.fetchone()
+
+        if row:
+            job_data = dict(row)
+            # Parse options back to dict
+            if job_data.get('options'):
+                job_data['options'] = json.loads(job_data['options'])
+            return job_data
+        return None
+
+def update_job(job_id: str, updates: Dict[str, Any]):
+    """Update specific fields in the job data"""
+    current_time = time.time()
+
+    # Start with SET updated_at=?
+    set_values = ["updated_at=?"]
+    params = [current_time]
+
+    # Add each update field
+    for key, value in updates.items():
+        if key == 'options':
+            value = json.dumps(value)
+        set_values.append(f"{key}=?")
+        params.append(value)
+
+    # Add job_id as the last parameter
+    params.append(job_id)
+
+    with sqlite3.connect(DB_PATH) as conn:
+        query = f"UPDATE jobs SET {', '.join(set_values)} WHERE id = ?"
+        conn.execute(query, params)
+        conn.commit()
+
+def get_pdf_path(job_id: str) -> str:
+    """Get the path where the PDF should be stored"""
+    return os.path.join(JOBS_DIR, f"{job_id}.pdf")
+
+def store_pdf(job_id: str, pdf_content: bytes):
+    """Store PDF in the filesystem"""
+    pdf_path = get_pdf_path(job_id)
+    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
+
+    with open(pdf_path, 'wb') as f:
+        f.write(pdf_content)
+
+def get_pdf(job_id: str) -> Optional[bytes]:
+    """Retrieve PDF from the filesystem"""
+    pdf_path = get_pdf_path(job_id)
+    if os.path.exists(pdf_path):
+        with open(pdf_path, 'rb') as f:
+            return f.read()
+    return None
+
+async def compile_latex(
+    job_id: str,
+    work_dir: str,
+    main_file: str,
+    num_runs: int,
+    use_bibtex: bool
+):
+    """Compile LaTeX document with proper error handling and multiple runs if needed."""
+    results = []
+    main_tex_path = os.path.join(work_dir, main_file)
+
+    # Verify the main file exists
+    if not os.path.exists(main_tex_path):
+        logger.error(f"Main LaTeX file not found: {main_tex_path}")
+        update_job(job_id, {
+            "status": "failed",
+            "error": f"Main LaTeX file ({main_file}) not found in the archive."
+        })
+        return False
+
+    # List directory contents for debugging
+    logger.info(f"Work directory contents: {os.listdir(work_dir)}")
+
+    try:
+        with working_directory(work_dir):
+            # Run pdflatex multiple times as needed
+            for i in range(num_runs):
+                update_job(job_id, {
+                    "status": "processing",
+                    "progress": f"LaTeX compilation {i+1}/{num_runs}"
+                })
+
+                # For verbose output to diagnose issues
+                cmd = [
+                    'pdflatex',
+                    '-interaction=nonstopmode',
+                    '-file-line-error',
+                    main_file
+                ]
+
+                try:
+                    result = await run_latex_command(cmd)
+                    results.append(result)
+
+                    # If compilation failed, stop and provide details
+                    if result["returncode"] != 0:
+                        # Extract relevant error messages
+                        error_lines = []
+                        for line in result["stdout"].split('\n'):
+                            if ":" in line and ("Error" in line or "Fatal" in line):
+                                error_lines.append(line)
+
+                        error_message = "LaTeX compilation failed"
+                        if error_lines:
+                            error_message = f"LaTeX errors: {' | '.join(error_lines[:3])}"
+
+                        update_job(job_id, {
+                            "status": "failed",
+                            "error": error_message,
+                            "details": json.dumps(result)
+                        })
+                        return False
+
+                    # Run bibtex if requested (after the first pdflatex run)
+                    if use_bibtex and i == 0:
+                        update_job(job_id, {
+                            "status": "processing",
+                            "progress": "Running BibTeX"
+                        })
+
+                        basename = os.path.splitext(main_file)[0]
+                        bibtex_cmd = ['bibtex', basename]
+
+                        bibtex_result = await run_latex_command(bibtex_cmd)
+                        results.append(bibtex_result)
+
+                except TimeoutError as e:
+                    logger.error(f"Timeout during compilation: {str(e)}")
+                    update_job(job_id, {
+                        "status": "failed",
+                        "error": str(e)
+                    })
+                    return False
+                except Exception as e:
+                    logger.error(f"Unexpected error during compilation: {str(e)}", exc_info=True)
+                    update_job(job_id, {
+                        "status": "failed",
+                        "error": f"Unexpected error: {str(e)}"
+                    })
+                    return False
+
+            # Check if the PDF was generated
+            pdf_basename = os.path.splitext(main_file)[0]
+            pdf_path = os.path.join(work_dir, f"{pdf_basename}.pdf")
+
+            if not os.path.exists(pdf_path):
+                logger.error(f"PDF not generated at expected path: {pdf_path}")
+                update_job(job_id, {
+                    "status": "failed",
+                    "error": "PDF file not generated despite successful compilation"
+                })
+                return False
+
+            # Store the PDF in the filesystem
+            with open(pdf_path, 'rb') as f:
+                pdf_content = f.read()
+            store_pdf(job_id, pdf_content)
+
+            # Update job status
+            update_job(job_id, {
+                "status": "completed",
+            })
+            return True
+
+    except Exception as e:
+        logger.error(f"Exception in compile_latex: {str(e)}", exc_info=True)
+        update_job(job_id, {
+            "status": "failed",
+            "error": f"Unexpected error: {str(e)}"
+        })
+        return False
+
+# Clean up old jobs (runs in background)
+async def cleanup_old_jobs():
+    """Clean up old jobs and their resources"""
+    while True:
+        try:
+            current_time = time.time()
+            expiry_time = current_time - JOB_EXPIRY
+
+            # Get expired jobs
+            with sqlite3.connect(DB_PATH) as conn:
+                conn.row_factory = sqlite3.Row
+                cursor = conn.execute('SELECT id, work_dir FROM jobs WHERE created_at < ?', (expiry_time,))
+                expired_jobs = cursor.fetchall()
+
+            for job in expired_jobs:
+                job_id = job['id']
+                work_dir = job['work_dir']
+
+                # Clean up PDF if it exists
+                pdf_path = get_pdf_path(job_id)
+                if os.path.exists(pdf_path):
+                    os.remove(pdf_path)
+
+                # Clean up work directory if it exists
+                if work_dir and os.path.exists(work_dir):
+                    shutil.rmtree(work_dir, ignore_errors=True)
+
+                # Remove job from database
+                with sqlite3.connect(DB_PATH) as conn:
+                    conn.execute('DELETE FROM jobs WHERE id = ?', (job_id,))
+                    conn.commit()
+
+                logger.info(f"Cleaned up expired job {job_id}")
+
+        except Exception as e:
+            logger.error(f"Error in cleanup task: {str(e)}", exc_info=True)
+
+        # Run cleanup every 15 minutes
+        await asyncio.sleep(900)
+
+@app.post("/tex2pdf",
+          dependencies=[Depends(check_rate_limit)],
+          summary="Convert LaTeX files to PDF",
+          response_description="Returns job ID for status checking")
+async def convert_to_pdf(
+    background_tasks: BackgroundTasks,
+    request: Request,
+    zip_file: UploadFile = File(...),
+    options: Optional[ConversionOptions] = None
+):
+    """
+    Takes a zip file containing LaTeX files and compiles them into a PDF.
+
+    - The zip file must contain all necessary files for compilation
+    - By default, assumes main.tex is the main file unless specified otherwise
+    - Returns a job ID that can be used to check status and retrieve the PDF
+    """
+    api_key = verify_api_key(request)
+    start_time = time.time()
+    job_id = str(uuid.uuid4())
+
+    if options is None:
+        options = ConversionOptions()
+
+    logger.info(f"Starting conversion job {job_id}")
+
+    # Validate input
+    if not zip_file.filename.endswith('.zip'):
+        logger.warning(f"Job {job_id}: Invalid file format: {zip_file.filename}")
+        raise HTTPException(
+            status_code=400,
+            detail="Uploaded file must be a zip archive."
+        )
+
+    if not validate_latex_filename(options.main_file):
+        logger.warning(f"Job {job_id}: Invalid main file name: {options.main_file}")
+        raise HTTPException(
+            status_code=400,
+            detail="Main file name must be a valid LaTeX filename (e.g., main.tex)"
+        )
+
+    # Create the job record
+    job_data = {
+        "id": job_id,
+        "status": "uploading",
+        "created_at": start_time,
+        "options": options.dict(),
+        "api_key": api_key,
+    }
+    store_job(job_id, job_data)
+
+    try:
+        # Create a temporary directory for this job
+        work_dir = tempfile.mkdtemp(prefix=f"tex2pdf_{job_id}_")
+        update_job(job_id, {
+            "status": "extracting",
+            "work_dir": work_dir
+        })
+
+        # Read zip file to memory
+        zip_content = await zip_file.read()
+        if len(zip_content) > MAX_UPLOAD_SIZE:
+            logger.warning(f"Job {job_id}: File too large: {len(zip_content)} bytes")
+            update_job(job_id, {
+                "status": "failed",
+                "error": f"File too large. Maximum size: {MAX_UPLOAD_SIZE/1024/1024} MB"
+            })
+            return {
+                "job_id": job_id,
+                "status": "failed",
+                "message": "File too large"
+            }
-
-            # Assuming the output PDF has the same base name as the main LaTeX file
-            output_pdf_path = os.path.join(tmpdirname, 'main.pdf')
-            if os.path.exists(output_pdf_path):
-                with open(output_pdf_path, 'rb') as f:
-                    pdf_content = f.read()
-                return StreamingResponse(BytesIO(pdf_content), media_type='application/pdf')
-            else:
-                return {"error": "PDF file not generated."}
-    else:
-        raise HTTPException(status_code=400, detail="Uploaded file is not a zip file.")
+
+        # Extract zip files safely
+        try:
+            sanitize_zip_archive(BytesIO(zip_content), work_dir)
+            update_job(job_id, {"status": "queued"})
+        except ValueError as e:
+            logger.warning(f"Job {job_id}: Zip extraction failed: {str(e)}")
+            update_job(job_id, {
+                "status": "failed",
+                "error": f"Zip extraction failed: {str(e)}"
+            })
+            return {
+                "job_id": job_id,
+                "status": "failed",
+                "message": str(e)
+            }
+
+        # Start compilation in background
+        background_tasks.add_task(
+            compile_latex,
+            job_id,
+            work_dir,
+            options.main_file,
+            options.num_runs,
+            options.use_bibtex
+        )
+
+        return {
+            "job_id": job_id,
+            "status": "processing",
+            "message": "Conversion job started"
+        }
+
+    except Exception as e:
+        logger.error(f"Job {job_id}: Unexpected error: {str(e)}", exc_info=True)
+        update_job(job_id, {
+            "status": "failed",
+            "error": f"Unexpected error: {str(e)}"
+        })
+        return {
+            "job_id": job_id,
+            "status": "failed",
+            "message": "Server error"
+        }
+
+@app.get("/tex2pdf/status/{job_id}",
+         dependencies=[Depends(verify_api_key)],
+         summary="Check the status of a conversion job")
+async def check_job_status(job_id: str):
+    """Check the status of a previously submitted conversion job."""
+    job = get_job(job_id)
+    if not job:
+        raise HTTPException(
+            status_code=404,
+            detail="Job not found"
+        )
+
+    # Clean sensitive or internal information
+    response = {
+        "job_id": job_id,
+        "status": job["status"],
+        "created_at": job["created_at"],
+    }
+
+    # Add error details if failed
+    if job["status"] == "failed" and "error" in job:
+        response["error"] = job["error"]
+
+    # Add progress info if processing
+    if job["status"] == "processing" and "progress" in job:
+        response["progress"] = job["progress"]
+
+    return response
+
+@app.get("/tex2pdf/download/{job_id}",
+         dependencies=[Depends(verify_api_key)],
+         summary="Download the generated PDF")
+async def download_pdf(job_id: str):
+    """Download the PDF generated by a completed conversion job."""
+    job = get_job(job_id)
+    if not job:
+        raise HTTPException(
+            status_code=404,
+            detail="Job not found"
+        )
+
+    if job["status"] != "completed":
+        raise HTTPException(
+            status_code=400,
+            detail=f"PDF not ready. Current status: {job['status']}"
+        )
+
+    try:
+        # Option 1: Get PDF from memory and stream it
+        # pdf_content = get_pdf(job_id)
+        # if not pdf_content:
+        #     raise HTTPException(
+        #         status_code=404,
+        #         detail="PDF file not found in storage"
+        #     )
+        #
+        # # Generate a filename based on the job ID
+        # filename = f"document_{job_id[-6:]}.pdf"
+        #
+        # return StreamingResponse(
+        #     BytesIO(pdf_content),
+        #     media_type='application/pdf',
+        #     headers={"Content-Disposition": f"attachment; filename={filename}"}
+        # )
+
+        # Option 2: Use FileResponse for more efficient file serving
+        pdf_path = get_pdf_path(job_id)
+        if not os.path.exists(pdf_path):
+            raise HTTPException(
+                status_code=404,
+                detail="PDF file not found in storage"
+            )
+
+        filename = f"document_{job_id[-6:]}.pdf"
+
+        return FileResponse(
+            pdf_path,
+            media_type='application/pdf',
+            filename=filename
+        )
+    except HTTPException:
+        # Propagate intentional HTTP errors (e.g., the 404 above) unchanged
+        raise
+    except Exception as e:
+        logger.error(f"Error delivering PDF for job {job_id}: {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=500,
+            detail="Error retrieving PDF file"
+        )
+
+@app.get("/health", summary="Health check endpoint")
+async def health_check():
+    """Simple health check endpoint to verify the API is running."""
+    try:
+        # Check database connection
+        with sqlite3.connect(DB_PATH) as conn:
+            cursor = conn.execute("SELECT 1")
+            cursor.fetchone()
+        db_status = "connected"
+    except Exception as e:
+        db_status = f"error: {str(e)}"
+
+    return {
+        "status": "healthy",
+        "version": VERSION,
+        "database": db_status,
+        "storage": os.path.exists(JOBS_DIR) and os.access(JOBS_DIR, os.W_OK)
+    }
+
+@app.on_event("startup")
+async def startup_event():
+    logger.info("Service starting up")
+    # Initialize the database
+    init_db()
+    # Start background cleanup task
+    asyncio.create_task(cleanup_old_jobs())
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Clean up on shutdown"""
+    logger.info("Service shutting down")
+    executor.shutdown(wait=False)
requirements.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+fastapi==0.110.0
+uvicorn==0.27.1
+pydantic==2.5.3
+python-multipart==0.0.6
+aiofiles==23.2.1