#!/bin/bash
# Ocrwrits (BASH Scripts, SH): ocrwrits.sh
# Batch OCR Processing Script - Recursive File Processing with GPU Support
# Version: 3.0.0
# Last Updated: 2025-12-25
# Author: Assistant
# License: MIT
#
# Description: Recursively processes all PDF/TIF/JPG/PNG files and extracts text using OCR optimized for handwritten/script text, with NVIDIA GPU support
# Dependencies: tesseract-ocr, imagemagick (with OpenCL), poppler-utils, bc, file, nvidia-cuda-toolkit (optional)
# Usage: ./ocrwrits.sh [output_dir]
# Continue processing even if some files fail: "set +e" keeps errexit off,
# so a command that returns non-zero does not terminate the script.
set +e
# Global variables
# USE_GPU: 0 = CPU settings, 1 = GPU settings. check_dependencies assigns it,
# enhance_image reads it to choose its convert options, and it is exported
# later in this file so the worker shells see the same value.
USE_GPU=0
# Function to check and update ImageMagick policy
#
# Locates ImageMagick's policy.xml, backs it up once as "<file>.backup" and
# edits it so that:
#   - the PDF, PNG and LABEL coders are no longer blocked
#     (rights="none" becomes rights="read|write");
#   - the memory / map / disk resource limits become 4GiB / 4GiB / 8GiB.
# Only the attribute values are replaced; the rest of each line (indentation,
# surrounding text) is preserved, and the file is rewritten only when at least
# one line actually changes.
#
# Arguments:
#   $1 - (optional) explicit path to a policy.xml. When omitted, the standard
#        system locations are searched in order.
# Returns: 0 if the policy is (now) configured as described, 1 on any error.
check_imagemagick_policy() {
  local policy_file="${1:-}"
  local candidate
  local -a policy_files=(
    "/etc/ImageMagick-6/policy.xml"
    "/etc/ImageMagick/policy.xml"
    "/usr/local/etc/ImageMagick-6/policy.xml"
    "/usr/local/etc/ImageMagick/policy.xml"
  )

  if [[ -z "$policy_file" ]]; then
    for candidate in "${policy_files[@]}"; do
      if [[ -f "$candidate" ]]; then
        policy_file="$candidate"
        break
      fi
    done
  fi

  if [[ -z "$policy_file" || ! -f "$policy_file" ]]; then
    log_message "ERROR" "ImageMagick policy file not found"
    return 1
  fi
  log_message "INFO" "Found ImageMagick policy file: $policy_file"

  if [[ ! -w "$policy_file" ]]; then
    log_message "ERROR" "No write permission for policy file. Please run with sudo."
    return 1
  fi

  # Never touch the policy without a pristine copy to fall back on.
  local backup_file="${policy_file}.backup"
  if [[ ! -f "$backup_file" ]]; then
    log_message "INFO" "Creating backup of policy file..."
    if ! cp -p -- "$policy_file" "$backup_file"; then
      log_message "ERROR" "Could not back up policy file; leaving it untouched"
      return 1
    fi
  fi

  # The regexes live in variables so their embedded double quotes are matched
  # literally by [[ =~ ]]. ImageMagick writes resource limits as
  #   <policy domain="resource" name="memory" value="256MiB"/>
  local coder_re='^(.*<policy[[:space:]]+domain="coder"[[:space:]]+rights=")none("[[:space:]]+pattern="(PDF|PNG|LABEL)".*)$'
  local resource_re='^(.*<policy[[:space:]]+domain="resource"[[:space:]]+name="(memory|map|disk)"[[:space:]]+value=")[^"]*(".*)$'

  local temp_policy
  if ! temp_policy=$(mktemp); then
    log_message "ERROR" "Could not create temporary file"
    return 1
  fi

  local needs_update=0
  local line new_line limit
  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    new_line="$line"
    if [[ $line =~ $coder_re ]]; then
      new_line="${BASH_REMATCH[1]}read|write${BASH_REMATCH[2]}"
    elif [[ $line =~ $resource_re ]]; then
      case "${BASH_REMATCH[2]}" in
        disk) limit="8GiB" ;;
        *) limit="4GiB" ;;
      esac
      new_line="${BASH_REMATCH[1]}${limit}${BASH_REMATCH[3]}"
    fi
    if [[ "$new_line" != "$line" ]]; then
      needs_update=1
    fi
    printf '%s\n' "$new_line"
  done < "$policy_file" > "$temp_policy"

  if (( needs_update == 0 )); then
    log_message "INFO" "ImageMagick policy is already correctly configured"
    rm -f -- "$temp_policy"
    return 0
  fi

  log_message "INFO" "Updating ImageMagick policy with GPU-friendly limits..."
  # Overwrite in place instead of moving the temp file over it: this keeps the
  # original owner and mode and only needs the file write permission that was
  # verified above (a move would need write access to the directory).
  if ! cat -- "$temp_policy" > "$policy_file"; then
    log_message "ERROR" "Failed to write updated policy file"
    rm -f -- "$temp_policy"
    return 1
  fi
  rm -f -- "$temp_policy"
  log_message "INFO" "ImageMagick policy updated successfully"
  return 0
}
# Function to log messages with timestamps
# Arguments: $1 - severity label (e.g. INFO, WARNING, ERROR), $2 - message
# Outputs:   "[YYYY-MM-DD HH:MM:SS] [LEVEL] message" on stdout
log_message() {
  local severity=$1
  local text=$2
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] [%s] %s\n' "$stamp" "$severity" "$text"
}
# Function to validate output directory
#
# Makes sure the output directory itself (not its parent) exists, creating it
# when necessary, is writable, and lives on a filesystem with enough free
# space.
#
# Arguments:
#   $1 - output directory path
#   $2 - (optional) minimum free space required, in MB (default: 2048)
# Returns: 0 if the directory is usable, 1 otherwise
validate_output_dir() {
  local output_dir="${1:-.}"
  local min_free_mb="${2:-2048}"
  local available_kb

  [[ "$min_free_mb" =~ ^[0-9]+$ ]] || min_free_mb=2048

  if [[ -e "$output_dir" && ! -d "$output_dir" ]]; then
    log_message "ERROR" "Output path '$output_dir' exists but is not a directory"
    return 1
  fi

  if [[ ! -d "$output_dir" ]]; then
    log_message "INFO" "Output directory '$output_dir' does not exist. Creating..."
    if ! mkdir -p -- "$output_dir"; then
      log_message "ERROR" "Failed to create output directory"
      return 1
    fi
  fi

  if [[ ! -w "$output_dir" ]]; then
    log_message "ERROR" "Output directory '$output_dir' is not writable"
    return 1
  fi

  # -P forces one line per filesystem (long device names are not wrapped) and
  # -k reports 1024-byte blocks, so column 4 of line 2 is the free space in KB.
  available_kb=$(df -Pk "$output_dir" 2>/dev/null | awk 'NR==2 {print $4}')
  if [[ ! "$available_kb" =~ ^[0-9]+$ ]]; then
    # Unknown free space is reported but not treated as fatal.
    log_message "WARNING" "Could not determine free disk space for '$output_dir'"
    return 0
  fi

  if (( available_kb / 1024 < min_free_mb )); then
    log_message "ERROR" "Insufficient disk space in output directory (less than ${min_free_mb}MB available)"
    return 1
  fi
  return 0
}
# Function to process a single file (PDF or image)
#
# PDFs are rendered page by page at 300 dpi; every page (or the single image
# for non-PDF input) is passed through enhance_image and then Tesseract, and
# the recognised text is appended to the output file.
#
# Arguments:
#   $1 - input file (extension decides PDF vs. image handling)
#   $2 - path of the text file to write
#   $3 - scratch directory owned by the calling worker
# Returns: 0 = success, 1 = failed, 2 = skipped (no text)
# On 1 and 2 no output file is left behind.
process_file() {
  local input_file="$1"
  local output_text="$2"
  local temp_dir="$3"
  local subdir file_ext pdf_info num_pages page
  local base_image image_file enhanced_file
  local pages_ok=0
  local -a page_images
  # Tesseract options shared by both branches.
  local -a ocr_opts=(-l eng --oem 1 --psm 6 --dpi 300 txt)

  # An empty scratch path would turn the cleanup below into "rm -rf /images".
  if [[ -z "$temp_dir" ]]; then
    log_message "ERROR" "No temp directory given for: $input_file"
    return 1
  fi

  # Start every file with empty scratch directories
  for subdir in images enhanced processed; do
    rm -rf -- "${temp_dir}/${subdir}"
    mkdir -p -- "${temp_dir}/${subdir}"
  done

  file_ext="${input_file##*.}"
  file_ext="${file_ext,,}"

  # Initialize text output
  if ! : > "$output_text"; then
    log_message "ERROR" "Cannot write output file: $output_text"
    return 1
  fi

  if [[ "$file_ext" == "pdf" ]]; then
    # Process PDF
    if ! pdf_info=$(pdfinfo "$input_file" 2>/dev/null); then
      log_message "ERROR" "Invalid PDF file: $input_file"
      rm -f -- "$output_text"
      return 1
    fi
    # Anchor on the "Pages:" field; other fields (e.g. Title) may contain the
    # word "Pages" as well.
    num_pages=$(awk '/^Pages:/ {print $2; exit}' <<< "$pdf_info")
    if [[ ! "$num_pages" =~ ^[0-9]+$ ]]; then
      log_message "ERROR" "Could not determine page count: $input_file"
      rm -f -- "$output_text"
      return 1
    fi

    for (( page = 1; page <= num_pages; page++ )); do
      base_image="${temp_dir}/images/page_${page}"
      if ! pdftoppm -f "$page" -l "$page" -png -r 300 "$input_file" "$base_image" 2>/dev/null; then
        continue
      fi
      # pdftoppm appends "-<page number>" (possibly zero padded) to the base
      # name, hence the glob. An unmatched glob stays literal and fails -f.
      page_images=("${base_image}"-*.png)
      image_file="${page_images[0]}"
      if [[ ! -f "$image_file" ]]; then
        continue
      fi

      # Enhance and OCR
      enhanced_file="${temp_dir}/enhanced/page_${page}.png"
      if ! enhance_image "$image_file" "$enhanced_file"; then
        continue
      fi
      if ! tesseract "$enhanced_file" "${temp_dir}/processed/page_${page}" \
        "${ocr_opts[@]}" 2>/dev/null; then
        continue
      fi
      pages_ok=$((pages_ok + 1))
      if [[ -f "${temp_dir}/processed/page_${page}.txt" ]]; then
        cat "${temp_dir}/processed/page_${page}.txt" >> "$output_text"
      fi
    done

    # Every page failing is a processing error, not a "photograph".
    if (( num_pages > 0 && pages_ok == 0 )); then
      log_message "ERROR" "No pages could be processed in: $input_file"
      rm -f -- "$output_text"
      return 1
    fi
  else
    # Process image file (TIF, JPG, PNG)
    enhanced_file="${temp_dir}/enhanced/image.png"
    if ! enhance_image "$input_file" "$enhanced_file"; then
      log_message "ERROR" "Failed to enhance image: $input_file"
      rm -f -- "$output_text"
      return 1
    fi
    if ! tesseract "$enhanced_file" "${temp_dir}/processed/output" \
      "${ocr_opts[@]}" 2>/dev/null; then
      log_message "ERROR" "OCR failed for: $input_file"
      rm -f -- "$output_text"
      return 1
    fi
    if [[ -f "${temp_dir}/processed/output.txt" ]]; then
      cat "${temp_dir}/processed/output.txt" >> "$output_text"
    fi
  fi

  # Check if output contains meaningful text
  if ! has_text_content "$output_text" 3; then
    # No meaningful text found - likely a photograph
    rm -f -- "$output_text"
    return 2
  fi
  return 0
}
# Function to enhance image for OCR
# Globals:   USE_GPU (read) - 1 selects the GPU option set, anything else the
#            CPU option set
# Arguments: $1 - source image, $2 - destination image
# Returns:   0 when convert succeeds, 1 otherwise
enhance_image() {
  local src="$1"
  local dst="$2"
  local -a im_args

  # The two modes differ only in the resource limits and the first filters.
  if [ "$USE_GPU" -eq 1 ]; then
    im_args=(
      -limit memory 4GiB -limit map 4GiB "$src"
      -resize 150%
      -unsharp 1.5x1+0.7+0.02
      -threshold 60%
      -noise 1
    )
  else
    im_args=(
      -limit memory 2GiB -limit map 2GiB "$src"
      -resize 120%
      -unsharp 1x1+0.5+0.01
      -threshold 50%
    )
  fi

  # Tail of the pipeline is identical for both modes.
  im_args+=(
    -normalize
    -density 300
    -depth 8
    -strip
    -background white
    -alpha remove
    -colorspace gray
    "$dst"
  )

  convert "${im_args[@]}" 2>/dev/null || return 1
  return 0
}
# Function to detect CPU cores
# Outputs: number of cores to use on stdout - the detected total minus 2,
#          never less than 1
detect_cores() {
  local detected usable

  # Sources are tried in this order: /proc/cpuinfo, nproc, sysctl, then a
  # fixed guess of 4.
  if [ -f /proc/cpuinfo ]; then
    detected=$(grep -c ^processor /proc/cpuinfo)
  elif command -v nproc >/dev/null 2>&1; then
    detected=$(nproc)
  elif command -v sysctl >/dev/null 2>&1; then
    detected=$(sysctl -n hw.ncpu 2>/dev/null || echo 4)
  else
    detected=4
  fi

  # Leave two cores free for the rest of the system
  usable=$((detected - 2))
  (( usable >= 1 )) || usable=1
  echo "$usable"
}
# Function to check dependencies and GPU support
#
# Verifies that every required command is available, warns when Tesseract is
# older than 4.0, and decides whether the GPU option set can be used.
#
# Globals:  USE_GPU (written) - 1 only when nvidia-smi reports a GPU and
#           ImageMagick lists OpenCL among its features, otherwise 0
# Returns:  0 when all required commands exist, 1 when any is missing
check_dependencies() {
  local dep tesseract_version gpu_name=""
  local tesseract_major=""
  local -a missing_deps=()
  local -a deps=(
    "tesseract"
    "convert"
    "pdftoppm"
    "pdfinfo"
    "bc"
    "file"
  )

  for dep in "${deps[@]}"; do
    if ! command -v "$dep" >/dev/null 2>&1; then
      missing_deps+=("$dep")
    fi
  done

  if (( ${#missing_deps[@]} > 0 )); then
    log_message "ERROR" "Missing dependencies: ${missing_deps[*]}"
    log_message "INFO" "Please install the required dependencies:"
    log_message "INFO" "sudo apt-get install tesseract-ocr imagemagick poppler-utils bc file"
    return 1
  fi

  # Compare the major version numerically so future releases are not flagged;
  # 2>&1 because older Tesseract builds print the version on stderr.
  tesseract_version=$(tesseract --version 2>&1)
  if [[ "$tesseract_version" =~ tesseract[[:space:]]+v?([0-9]+) ]]; then
    tesseract_major="${BASH_REMATCH[1]}"
  fi
  if [[ -z "$tesseract_major" ]] || (( 10#$tesseract_major < 4 )); then
    log_message "WARNING" "Tesseract version < 4.0 detected. Handwritten OCR works best with 4.0+ (LSTM engine)."
  fi

  USE_GPU=0
  # nvidia-smi merely being installed is not enough: it must run successfully
  # and name at least one GPU.
  if command -v nvidia-smi >/dev/null 2>&1 \
    && gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null) \
    && [[ -n "$gpu_name" ]]; then
    # Only the first GPU is mentioned in the log
    log_message "INFO" "NVIDIA GPU detected: ${gpu_name%%$'\n'*}"
    if convert -version 2>/dev/null | grep -q "OpenCL"; then
      log_message "INFO" "ImageMagick compiled with OpenCL support - GPU acceleration enabled"
      USE_GPU=1
    else
      log_message "WARNING" "ImageMagick lacks OpenCL support. Install a GPU-enabled version for better performance."
    fi
  else
    log_message "WARNING" "No NVIDIA GPU detected. Running on CPU only."
  fi
  return 0
}
# Function to update progress counter (thread-safe)
# A directory next to the counter file serves as the lock: mkdir either
# creates it (lock taken) or fails because another worker holds it.
# Arguments: $1 - counter file
# Outputs:   the new counter value on stdout
increment_progress() {
  local progress_file="$1"
  local mutex_dir="${progress_file}.lock"
  local tally=0

  # Spin until the lock directory can be created
  until mkdir "$mutex_dir" 2>/dev/null; do
    sleep 0.01
  done

  # Read, bump and store the counter while holding the lock
  [ -f "$progress_file" ] && tally=$(cat "$progress_file")
  tally=$((tally + 1))
  echo "$tally" > "$progress_file"

  # Give the lock back
  rmdir "$mutex_dir"
  echo "$tally"
}
# Function to display progress bar with time estimation
#
# Polls the counter file once per second and redraws a one-line progress bar
# until the counter reaches the total, then prints a final 100% line.
#
# Arguments:
#   $1 - total number of files
#   $2 - start time in seconds since the epoch
#   $3 - counter file maintained by increment_progress
show_progress_monitor() {
  local total=$1
  local start_time=$2
  local progress_file="$3"
  local width=50
  local current=0 raw percentage filled elapsed est_remaining avg_tenths
  local bar_done bar_todo elapsed_str remaining_str

  while true; do
    raw=""
    if [ -f "$progress_file" ]; then
      raw=$(cat "$progress_file" 2>/dev/null)
    fi
    # The writer truncates the file before writing, so a read can come back
    # empty; keep the previous value instead of doing arithmetic on it.
    if [[ "$raw" =~ ^[0-9]+$ ]]; then
      current=$((10#$raw))
    fi

    if (( current >= total )); then
      break
    fi

    percentage=$((current * 100 / total))
    filled=$((width * current / total))
    elapsed=$(( $(date +%s) - start_time ))
    (( elapsed >= 0 )) || elapsed=0

    # Integer arithmetic only: average is kept in tenths of a second.
    avg_tenths=0
    est_remaining=0
    if (( current > 0 )); then
      avg_tenths=$((elapsed * 10 / current))
      est_remaining=$((elapsed * (total - current) / current))
    fi

    # Formatting by hand works on any platform and does not wrap at 24 hours.
    printf -v elapsed_str '%02d:%02d:%02d' \
      $((elapsed / 3600)) $((elapsed % 3600 / 60)) $((elapsed % 60))
    printf -v remaining_str '%02d:%02d:%02d' \
      $((est_remaining / 3600)) $((est_remaining % 3600 / 60)) $((est_remaining % 60))

    printf -v bar_done '%*s' "$filled" ''
    printf -v bar_todo '%*s' "$((width - filled))" ''

    printf '\rProgress: [%s%s] %3d%% | Files: %d/%d | Elapsed: %s | Remaining: %s | Avg: %d.%ds/file ' \
      "${bar_done// /#}" "${bar_todo// /-}" \
      "$percentage" "$current" "$total" \
      "$elapsed_str" "$remaining_str" \
      $((avg_tenths / 10)) $((avg_tenths % 10))
    sleep 1
  done

  # Final update; the trailing padding wipes what is left of the longer
  # in-progress line.
  printf -v bar_done '%*s' "$width" ''
  printf '\rProgress: [%s] 100%% | Files: %d/%d | Complete!%50s\n' \
    "${bar_done// /#}" "$total" "$total" ''
}
# Function to convert file path to output filename
# A leading "./" is dropped, every "/" becomes "_", and the last extension is
# replaced by ".txt".
# NOTE(review): inputs that differ only in extension (scan.pdf / scan.jpg), or
# only in "/" vs "_", map to the same output name.
# Arguments: $1 - input path, $2 - output directory
# Outputs:   "<output_dir>/<flattened name>.txt" on stdout
path_to_filename() {
  local filepath="$1"
  local output_dir="$2"
  local flattened

  filepath="${filepath#./}"
  # Flatten the directory structure into the file name
  flattened="${filepath//\//_}"
  # Cut everything from the last dot onwards
  flattened="${flattened%.*}"
  echo "${output_dir}/${flattened}.txt"
}
# Function to validate file (PDF or image)
# Arguments: $1 - path to check
# Returns:   0 when the path is a readable regular file, otherwise logs an
#            error and returns 1
validate_file() {
  local file="$1"

  if [[ -f "$file" && -r "$file" ]]; then
    return 0
  fi

  log_message "ERROR" "File '$file' does not exist or is not readable"
  return 1
}
# Function to check if text contains meaningful words
# A "word" is a run of two or more alphanumeric characters.
# Arguments: $1 - text file, $2 - minimum number of words (default 3)
# Returns:   0 when the file holds at least that many words, 1 otherwise
#            (including a missing or empty file)
has_text_content() {
  local text_file="$1"
  local min_words="${2:-3}"
  local found

  # Nothing to count in a missing or zero-length file
  if [[ ! -f "$text_file" || ! -s "$text_file" ]]; then
    return 1
  fi

  # grep -o prints one match per line, so the line count is the word count
  found=$(grep -oE '[[:alnum:]]{2,}' "$text_file" 2>/dev/null | wc -l)
  [ "$found" -ge "$min_words" ] && return 0
  return 1
}
# Worker function for parallel processing
# Handles one input file: validates it, runs process_file in a private scratch
# directory, records the outcome by appending a line to the matching tally
# file, and bumps the shared progress counter.
# Arguments: $1 input file, $2 output dir, $3 progress file, $4 success tally,
#            $5 failed tally, $6 skipped tally, $7 base directory for scratch
# Returns:   1 when the input file fails validation, 0 otherwise
process_file_worker() {
  local input_file="$1"
  local output_dir="$2"
  local progress_file="$3"
  local success_file="$4"
  local failed_file="$5"
  local skipped_file="$6"
  local temp_base_dir="$7"
  local scratch tally_file target

  # Private scratch area for this worker
  scratch="${temp_base_dir}/worker_$$_${RANDOM}"
  mkdir -p "${scratch}/"{images,enhanced,processed}

  # Unusable input counts as a failure straight away
  if ! validate_file "$input_file"; then
    echo "1" >> "$failed_file"
    increment_progress "$progress_file" >/dev/null
    rm -rf "$scratch"
    return 1
  fi

  # Where the recognised text goes
  target=$(path_to_filename "$input_file" "$output_dir")

  # Run the OCR (USE_GPU comes from the exported environment) and map its
  # status to a tally file: 0 = text found, 2 = no text, anything else = error
  process_file "$input_file" "$target" "$scratch"
  case $? in
    0) tally_file="$success_file" ;;
    2) tally_file="$skipped_file" ;;
    *) tally_file="$failed_file" ;;
  esac
  echo "1" >> "$tally_file"

  # Count the file as done and drop the scratch area
  increment_progress "$progress_file" >/dev/null
  rm -rf "$scratch"
  return 0
}
# ---------------------------------------------------------------------------
# Main driver: checks the environment, collects every PDF/TIF/JPG/PNG below
# the current directory, OCRs them in parallel and prints a summary.
# Exit status: 1 on a setup error or when no file produced text, else 0.
# ---------------------------------------------------------------------------

# Parse arguments
OUTPUT_DIR="${1:-.}"

# Check dependencies
log_message "INFO" "Checking dependencies..."
if ! check_dependencies; then
  exit 1
fi

# Detect CPU cores
CORES=$(detect_cores)
log_message "INFO" "Using $CORES CPU core(s) for parallel processing"

# Export functions and variables for parallel execution (each worker is a
# fresh bash started by xargs and only sees what is exported)
export -f process_file_worker
export -f process_file
export -f enhance_image
export -f validate_file
export -f path_to_filename
export -f increment_progress
export -f log_message
export -f has_text_content
export USE_GPU

# Check and update ImageMagick policy (suppress errors if not running as root)
log_message "INFO" "Checking ImageMagick policy..."
check_imagemagick_policy 2>/dev/null || log_message "WARNING" "Could not update ImageMagick policy. You may need to run with sudo if processing fails."

# Validate output directory
log_message "INFO" "Validating output directory..."
if ! validate_output_dir "$OUTPUT_DIR"; then
  exit 1
fi

# Create output directory if it doesn't exist
if ! mkdir -p -- "$OUTPUT_DIR"; then
  log_message "ERROR" "Failed to create output directory"
  exit 1
fi

# Find all files recursively. Names travel NUL-delimited so that spaces,
# quotes and newlines in file names survive; scratch directories left behind
# by an interrupted earlier run are not scanned.
log_message "INFO" "Searching for files recursively..."
FILES=()
while IFS= read -r -d '' found_file; do
  FILES+=("$found_file")
done < <(find . -path './ocr_temp_*' -prune -o -type f \( -iname "*.pdf" -o -iname "*.tif" -o -iname "*.tiff" -o -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" \) -print0 | sort -z)
total_files=${#FILES[@]}
if [ "$total_files" -eq 0 ]; then
  log_message "ERROR" "No PDF, TIF, JPG, or PNG files found in current directory or subdirectories"
  exit 1
fi
log_message "INFO" "Found $total_files file(s) to process"

# Create temp directory (unique name, kept in the working directory because
# rendered pages can take a lot of space)
if ! TEMP_DIR=$(mktemp -d "./ocr_temp_XXXXXX"); then
  log_message "ERROR" "Failed to create temp directory"
  exit 1
fi

# Cleanup on exit: stop the progress monitor if it is still running and
# remove the scratch directory
PROGRESS_PID=""
cleanup() {
  if [ -n "$PROGRESS_PID" ]; then
    kill "$PROGRESS_PID" 2>/dev/null
  fi
  rm -rf -- "${TEMP_DIR}"
}
trap cleanup EXIT

# Create progress tracking files
PROGRESS_FILE="${TEMP_DIR}/progress.txt"
SUCCESS_FILE="${TEMP_DIR}/success.txt"
FAILED_FILE="${TEMP_DIR}/failed.txt"
SKIPPED_FILE="${TEMP_DIR}/skipped.txt"
echo "0" > "$PROGRESS_FILE"
: > "$SUCCESS_FILE"
: > "$FAILED_FILE"
: > "$SKIPPED_FILE"

# Start progress monitor in background
start_time=$(date +%s)
show_progress_monitor "$total_files" "$start_time" "$PROGRESS_FILE" &
PROGRESS_PID=$!

# Process all files in parallel
log_message "INFO" "Starting parallel OCR processing..."
# Use xargs for parallel execution. Every value is handed over as a positional
# parameter ($1-$6 fixed, $7 = the file name appended by xargs) instead of
# being pasted into the command string, so file names are never parsed as
# shell code.
printf '%s\0' "${FILES[@]}" \
  | xargs -0 -n 1 -P "$CORES" \
    bash -c 'process_file_worker "$7" "$1" "$2" "$3" "$4" "$5" "$6"' _ \
    "$OUTPUT_DIR" "$PROGRESS_FILE" "$SUCCESS_FILE" "$FAILED_FILE" "$SKIPPED_FILE" "$TEMP_DIR" \
  || true

# All workers are done. Force the counter to the total so the monitor always
# terminates, even if a worker died before counting its file.
echo "$total_files" > "$PROGRESS_FILE"

# Wait for progress monitor to complete
wait "$PROGRESS_PID" 2>/dev/null || true
PROGRESS_PID=""

# Calculate results
successful_files=$(wc -l < "$SUCCESS_FILE" 2>/dev/null || echo 0)
failed_files=$(wc -l < "$FAILED_FILE" 2>/dev/null || echo 0)
skipped_files=$(wc -l < "$SKIPPED_FILE" 2>/dev/null || echo 0)
end_time=$(date +%s)
total_time=$((end_time - start_time))
# Formatted by hand: works without GNU date and does not wrap at 24 hours
printf -v total_time_str '%02d:%02d:%02d' \
  $((total_time / 3600)) $((total_time % 3600 / 60)) $((total_time % 60))
avg_time_per_file=$(bc <<< "scale=2; $total_time / $total_files" 2>/dev/null || echo "0")

log_message "INFO" "Processing complete:"
log_message "INFO" "- Successfully processed files: $successful_files/$total_files"
log_message "INFO" "- Skipped files (no text/photographs): $skipped_files"
log_message "INFO" "- Failed files: $failed_files"
log_message "INFO" "- Total processing time: $total_time_str"
log_message "INFO" "- Average time per file: ${avg_time_per_file}s"
log_message "INFO" "- Cores used: $CORES"
log_message "INFO" "- Output directory: $OUTPUT_DIR"

if [ "$successful_files" -eq 0 ]; then
  log_message "WARNING" "No files with text content were found"
  if [ "$skipped_files" -gt 0 ]; then
    log_message "INFO" "$skipped_files files were skipped (likely photographs with no text)"
  fi
  exit 1
fi