Extract structured timesheet data from PDF documents (typed or handwritten) using the Document Extraction API. This guide shows you how to build a complete timesheet extraction solution.
The Document Extraction API can automatically extract timesheet data including:
- Employee information (ID, name, company)
- Work shift details (start/end times, durations)
- Monthly summaries (total hours, days worked)
- Document type classification (handwritten vs typed)
- Confidence scores for quality control
Create a JSON schema that matches your timesheet structure:
{
"type": "object",
"properties": {
"confidence": {
"type": "object",
"properties": {
"image_analysis": {"type": "number", "minimum": 0, "maximum": 100},
"data_extraction": {"type": "number", "minimum": 0, "maximum": 100},
"validation": {"type": "number", "minimum": 0, "maximum": 100},
"overall": {"type": "number", "minimum": 0, "maximum": 100}
},
"required": ["image_analysis", "data_extraction", "validation", "overall"]
},
"employee_id": {"type": "string"},
"employee_name": {"type": "string"},
"company_name": {"type": "string"},
"summary": {
"type": "object",
"properties": {
"month": {"type": "string"},
"year": {"type": "integer"},
"total_days_worked": {"type": "integer"},
"total_time_minutes": {"type": "integer"},
"total_time_hours": {"type": "number"},
"total_time_formatted": {"type": "string"}
}
},
"log_entries": {
"type": "array",
"items": {
"type": "object",
"properties": {
"start_date_time": {"type": "string", "format": "date-time"},
"end_date_time": {"type": "string", "format": "date-time"},
"total_time": {"type": "integer"}
}
}
},
"document_type": {"type": "string", "enum": ["handwritten", "typed"]}
}
}

Guide the AI with specific extraction instructions:
Extract all timesheet data. For confidence scores: estimate image_analysis
based on text clarity (0-100), data_extraction based on how complete the
data is (0-100), validation as 100 if all required fields are present, and
overall as the average of all three. Classify document_type as "handwritten"
if the document appears to be handwritten, otherwise "typed". For log_entries,
extract all work shifts with start and end times in ISO-8601 format with
timezone (+02:00 for Central European Time). Calculate total_time in minutes
for each entry. In the summary, calculate totals across all entries.

import requests
import json

# Endpoint and authentication for the Document Extraction API.
url = "https://api.aitronos.com/v1/documents/extract"
headers = {"Authorization": "Bearer YOUR_API_TOKEN"}

schema = {
    # ... your schema from step 1
}

data = {
    "organization_id": "org_abc123",
    "schema": json.dumps(schema),
    "sync": "true",  # wait for the result in the same request
    "model": "gpt-4o",
    "prompt": "Extract all timesheet data..."  # Your custom prompt
}

# Open the PDF in a context manager so the file handle is always closed,
# even if the request raises (the original left the handle open).
with open("timesheet.pdf", "rb") as pdf:
    files = {"file": pdf}
    response = requests.post(url, headers=headers, files=files, data=data)
result = response.json()

if result['success'] and result['status'] == 'completed':
    print(f"Employee: {result['extracted_data']['employee_name']}")
    print(f"Total Hours: {result['extracted_data']['summary']['total_time_formatted']}")
    print(f"Confidence: {result['confidence']}%")
else:
    print(f"Error: {result.get('error_message')}")

import requests
import json
from typing import Dict, Any
class TimesheetExtractor:
    """Extract structured timesheet data from PDFs via the Document Extraction API.

    Wraps schema definition, prompt construction, the HTTP call, and a
    post-extraction quality gate in one reusable client.
    """

    def __init__(self, api_token: str, organization_id: str):
        """Store API credentials and the target organization.

        Args:
            api_token: Bearer token for the Document Extraction API.
            organization_id: Organization the documents belong to.
        """
        self.api_token = api_token
        self.organization_id = organization_id
        self.api_url = "https://api.aitronos.com/v1/documents/extract"

    def get_schema(self) -> Dict[str, Any]:
        """Return the timesheet extraction schema (JSON Schema dict)."""
        return {
            "type": "object",
            "properties": {
                "confidence": {
                    "type": "object",
                    "properties": {
                        "image_analysis": {"type": "number", "minimum": 0, "maximum": 100},
                        "data_extraction": {"type": "number", "minimum": 0, "maximum": 100},
                        "validation": {"type": "number", "minimum": 0, "maximum": 100},
                        "overall": {"type": "number", "minimum": 0, "maximum": 100}
                    },
                    "required": ["image_analysis", "data_extraction", "validation", "overall"]
                },
                "employee_id": {"type": "string"},
                "employee_name": {"type": "string"},
                "company_name": {"type": "string"},
                "summary": {
                    "type": "object",
                    "properties": {
                        "month": {"type": "string"},
                        "year": {"type": "integer"},
                        "total_days_worked": {"type": "integer"},
                        "total_time_minutes": {"type": "integer"},
                        "total_time_hours": {"type": "number"},
                        "total_time_formatted": {"type": "string"}
                    },
                    "required": ["month", "year", "total_days_worked",
                                 "total_time_minutes", "total_time_hours",
                                 "total_time_formatted"]
                },
                "log_entries": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "start_date_time": {"type": "string"},
                            "end_date_time": {"type": "string"},
                            "total_time": {"type": "integer"}
                        },
                        "required": ["start_date_time", "end_date_time", "total_time"]
                    }
                },
                "document_type": {"type": "string", "enum": ["handwritten", "typed"]}
            },
            "required": ["confidence", "employee_id", "employee_name",
                         "company_name", "summary", "log_entries", "document_type"]
        }

    def get_prompt(self) -> str:
        """Return the extraction prompt sent alongside the schema."""
        return """Extract all timesheet data. For confidence scores: estimate
image_analysis based on text clarity (0-100), data_extraction based on
how complete the data is (0-100), validation as 100 if all required
fields are present, and overall as the average of all three. Classify
document_type as "handwritten" if the document appears to be handwritten,
otherwise "typed". For log_entries, extract all work shifts with start
and end times in ISO-8601 format with timezone (+02:00 for Central
European Time). Calculate total_time in minutes for each entry. In the
summary, calculate totals across all entries."""

    def extract(self, file_path: str, sync: bool = True) -> Dict[str, Any]:
        """Extract timesheet data from a PDF file.

        Args:
            file_path: Path of the PDF to process.
            sync: When True, the API processes the document in the same
                request and the response carries the final result.

        Returns:
            The raw API response as a dict.
        """
        headers = {"Authorization": f"Bearer {self.api_token}"}
        data = {
            "organization_id": self.organization_id,
            "schema": json.dumps(self.get_schema()),
            "prompt": self.get_prompt(),
            "sync": str(sync).lower(),
            "model": "gpt-4o"
        }
        # Context manager guarantees the PDF handle is closed even if the
        # request raises (the original leaked the file object).
        with open(file_path, "rb") as pdf:
            response = requests.post(self.api_url, headers=headers,
                                     files={"file": pdf}, data=data)
        return response.json()

    def validate_extraction(self, result: Dict[str, Any]) -> bool:
        """Validate extraction quality.

        Returns True only when the API call succeeded, the job completed,
        overall confidence is at least 70, and all business-critical
        fields are present and non-empty.
        """
        if not result.get('success'):
            return False
        if result.get('status') != 'completed':
            return False
        # Check confidence threshold
        confidence = result.get('confidence', 0)
        if confidence < 70:
            print(f"Warning: Low confidence ({confidence}%) - manual review recommended")
            return False
        # Check required fields
        data = result.get('extracted_data', {})
        required_fields = ['employee_id', 'employee_name', 'summary', 'log_entries']
        for field in required_fields:
            if not data.get(field):
                print(f"Warning: Missing required field: {field}")
                return False
        return True
# Usage
extractor = TimesheetExtractor(
api_token="YOUR_API_TOKEN",
organization_id="org_abc123"
)
# Synchronous call: blocks until the API has processed the PDF.
result = extractor.extract("timesheet.pdf")
# Quality gate: success + completed + confidence >= 70 + required fields.
if extractor.validate_extraction(result):
data = result['extracted_data']
print(f"✅ Extraction successful!")
print(f"Employee: {data['employee_name']} ({data['employee_id']})")
print(f"Company: {data['company_name']}")
print(f"Period: {data['summary']['month']} {data['summary']['year']}")
print(f"Total Hours: {data['summary']['total_time_formatted']}")
print(f"Days Worked: {data['summary']['total_days_worked']}")
print(f"Document Type: {data['document_type']}")
print(f"Confidence: {result['confidence']}%")
# cost_chf is reported by the API per processed document.
print(f"Cost: CHF {result['cost_chf']:.4f}")
else:
print("❌ Extraction failed or quality too low")

{
"success": true,
"job_id": "doc_abc123",
"status": "completed",
"extracted_data": {
"confidence": {
"image_analysis": 95,
"data_extraction": 90,
"validation": 100,
"overall": 95
},
"employee_id": "12326",
"employee_name": "Frei (T) Celine",
"company_name": "Spitex Region Zofingen AG",
"summary": {
"month": "August",
"year": 2025,
"total_days_worked": 5,
"total_time_minutes": 1973,
"total_time_hours": 32.88,
"total_time_formatted": "32:53"
},
"log_entries": [
{
"start_date_time": "2025-08-07T16:50:00+02:00",
"end_date_time": "2025-08-07T22:17:00+02:00",
"total_time": 327
},
{
"start_date_time": "2025-08-08T12:00:00+02:00",
"end_date_time": "2025-08-08T12:15:00+02:00",
"total_time": 15
}
],
"document_type": "typed"
},
"explanation": "Extracted timesheet for employee 12326 (Celine Frei) from Spitex Region Zofingen AG for August 2025. Found 5 work days with a total of 32.88 hours (1,973 minutes).",
"confidence": 95.0,
"processing_time": 8.78,
"cost_chf": 0.015,
"model_used": "gpt-4o",
"created_at": "2025-12-19T14:27:07Z",
"completed_at": "2025-12-19T14:27:20Z"
}

Always check confidence scores before processing data:
# Route each result by confidence band:
#   < 70 -> human review; 70-84 -> automated with extra validation;
#   >= 85 -> fully automated.
# NOTE(review): the three handlers are application-defined hooks, not
# part of this guide's code — confirm their names against your codebase.
if result['confidence'] < 70:
# Flag for manual review
send_to_manual_review(result)
elif result['confidence'] < 85:
# Automated processing with validation
process_with_validation(result)
else:
# High confidence - automated processing
process_automatically(result)

Not all fields may be present in every document:
# extracted_data may omit optional fields; read them with .get() and
# sensible defaults instead of direct indexing.
data = result['extracted_data']
# Safe field access
employee_id = data.get('employee_id', 'UNKNOWN')
company = data.get('company_name', 'Not specified')
# Check for empty log entries
if not data.get('log_entries'):
print("Warning: No work shifts found")

Verify that totals match individual entries:
def validate_totals(data: Dict[str, Any]) -> bool:
"""Verify summary totals match log entries.

Returns False when the summed per-entry minutes differ from the
summary's total_time_minutes by more than 5 minutes.
"""
log_entries = data.get('log_entries', [])
summary = data.get('summary', {})
# Calculate actual total from entries
# NOTE(review): entry['total_time'] raises KeyError if an entry lacks
# the field — the schema marks it required, so this assumes the API
# honored the schema; confirm for your inputs.
actual_minutes = sum(entry['total_time'] for entry in log_entries)
reported_minutes = summary.get('total_time_minutes', 0)
# Allow small rounding differences
# A 5-minute tolerance absorbs per-entry rounding in the summary.
if abs(actual_minutes - reported_minutes) > 5:
print(f"Warning: Total mismatch - {actual_minutes} vs {reported_minutes}")
return False
return True

Process multiple timesheets efficiently:
import os
from concurrent.futures import ThreadPoolExecutor
def process_timesheet_batch(file_paths: list[str],
                            organization_id: str = "org_abc123",
                            max_workers: int = 5) -> list[Dict[str, Any]]:
    """Process multiple timesheets in parallel.

    Args:
        file_paths: Paths of the PDF files to extract.
        organization_id: Organization the documents belong to
            (previously hard-coded; kept as the default for compatibility).
        max_workers: Number of concurrent API requests.

    Returns:
        One raw API response dict per input file, in input order.
    """
    extractor = TimesheetExtractor(
        api_token=os.environ["FREDDY_API_KEY"],
        organization_id=organization_id
    )
    # executor.map preserves input order, so results[i] matches file_paths[i].
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(extractor.extract, file_paths))
    return results
# Run the batch extractor over every PDF in the timesheets/ directory.
pdf_dir = "timesheets/"
pdf_files = [name for name in os.listdir(pdf_dir) if name.endswith('.pdf')]
results = process_timesheet_batch([f"{pdf_dir}{name}" for name in pdf_files])
# Keep only extractions that succeeded with acceptable confidence.
successful = [resp for resp in results
              if resp.get('success') and resp.get('confidence', 0) >= 70]
print(f"Processed {len(successful)}/{len(results)} timesheets successfully")

Problem: Extraction confidence below 70%
Solutions:
- Ensure document is high quality (300+ DPI for scans)
- Check that document is not rotated or skewed
- Verify document contains expected data
- Try using gpt-4o instead of gpt-4o-mini for better accuracy
Problem: Some work shifts not extracted
Solutions:
- Add to custom prompt: "Extract ALL work shifts, including partial days"
- Check the explanation field for details
- Verify document format matches expected structure
- Ensure shifts are clearly visible in the document
Problem: Total hours don't match individual entries
Solutions:
- Specify timezone in custom prompt
- Verify time format in source document (AM/PM vs 24-hour)
- Check for overlapping shifts
- Validate date boundaries (shifts crossing midnight)
Problem: Typed document classified as handwritten
Solutions:
- Add to prompt: "Classify as handwritten only if text is clearly handwritten"
- Check OCR confidence scores
- Manually override if needed
- gpt-4o-mini: ~$0.005-0.02 per document (good for simple typed timesheets)
- gpt-4o: ~$0.01-0.05 per document (better for handwritten or complex documents)
# Use mini for typed documents
# gpt-4o-mini is cheaper and adequate for typed timesheets; handwritten
# documents need the full gpt-4o model for acceptable accuracy.
# NOTE(review): document_type is assumed to come from a prior extraction
# or manual classification — confirm where it is set in your pipeline.
if document_type == "typed":
model = "gpt-4o-mini"
else:
model = "gpt-4o"

Results are cached for 24 hours. Avoid re-processing the same document:
import hashlib
def get_document_hash(file_path: str) -> str:
    """Return the SHA-256 hex digest of a file, for use as a cache key.

    Reads the file in fixed-size chunks so large PDFs are not loaded
    into memory at once (the original read the whole file in one call).

    Args:
        file_path: Path of the document to hash.

    Returns:
        Lowercase hex SHA-256 digest of the file contents.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # 64 KiB chunks keep memory usage flat regardless of file size.
        for chunk in iter(lambda: f.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
# Check cache before processing
# NOTE(review): illustrative fragment — `return` is only valid inside a
# function, and processed_cache must be defined by the surrounding code.
doc_hash = get_document_hash("timesheet.pdf")
if doc_hash in processed_cache:
return processed_cache[doc_hash]