# Ocrwrits
#
# Shell Script Uploaded Feb 16, 2026 by pagetelegram 2 views 18.25 KB
# BASH Scripts
# SH ocrwrits.sh
#!/bin/bash

# Batch OCR Processing Script - Recursive File Processing with GPU Support
# Version: 3.0.0
# Last Updated: 2025-12-25
# Author: Assistant
# License: MIT
#
# Description: Recursively processes all PDF/TIF/JPG/PNG files and extracts text using OCR optimized for handwritten/script text, with NVIDIA GPU support
# Dependencies: tesseract-ocr, imagemagick (with OpenCL), poppler-utils, bc, file, nvidia-cuda-toolkit (optional)
# Usage: ./ocrwrits.sh [output_dir]

# Continue processing even if some files fail: errexit is explicitly switched
# off, so a failing command never aborts the whole run.
set +e

# Global variables
# USE_GPU: 0 selects the CPU branch of enhance_image, 1 the GPU branch.
# It is assigned by check_dependencies and exported by the main script.
USE_GPU=0

# Function to check and update ImageMagick policy
check_imagemagick_policy() {
    # Relax ImageMagick's policy.xml so the PDF/PNG/LABEL coders are usable
    # and raise the memory/map/disk resource limits.
    #
    # Arguments: $1 - optional explicit path to a policy.xml; when omitted the
    #                 first existing well-known location is used.
    # Returns:   0 when the policy is (now) configured, 1 on any error.
    # Side effects: creates "<policy>.backup" once, rewrites the policy in place.
    local policy_file="${1:-}"
    local candidate
    local policy_files=(
        "/etc/ImageMagick-6/policy.xml"
        "/etc/ImageMagick/policy.xml"
        "/usr/local/etc/ImageMagick-6/policy.xml"
        "/usr/local/etc/ImageMagick/policy.xml"
        "/etc/ImageMagick-7/policy.xml"
        "/usr/local/etc/ImageMagick-7/policy.xml"
    )

    if [ -z "$policy_file" ]; then
        for candidate in "${policy_files[@]}"; do
            if [ -f "$candidate" ]; then
                policy_file="$candidate"
                break
            fi
        done
    fi

    if [ -z "$policy_file" ] || [ ! -f "$policy_file" ]; then
        log_message "ERROR" "ImageMagick policy file not found"
        return 1
    fi

    log_message "INFO" "Found ImageMagick policy file: $policy_file"

    if [ ! -w "$policy_file" ]; then
        log_message "ERROR" "No write permission for policy file. Please run with sudo."
        return 1
    fi

    # Never touch the policy without a pristine copy to fall back on.
    local backup_file="${policy_file}.backup"
    if [ ! -f "$backup_file" ]; then
        log_message "INFO" "Creating backup of policy file..."
        if ! cp -p "$policy_file" "$backup_file"; then
            log_message "ERROR" "Failed to create backup of policy file"
            return 1
        fi
    fi

    local temp_policy
    if ! temp_policy=$(mktemp); then
        log_message "ERROR" "Failed to create temporary policy file"
        return 1
    fi

    local line new_line limit
    local needs_update=0
    # Patterns are kept in variables so the quotes and parentheses are taken
    # as regex text/grouping by [[ =~ ]].
    local coder_re='<policy domain="coder" rights="none" pattern="(PDF|PNG|LABEL)"'
    local resource_re='<policy domain="resource" name="(memory|map|disk)"'

    # "|| [ -n "$line" ]" keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        new_line="$line"
        # Elements commented out on their own line are inactive; leave them.
        if [[ $line != *'<!--'* ]]; then
            if [[ $line =~ $coder_re ]]; then
                new_line="  <policy domain=\"coder\" rights=\"read|write\" pattern=\"${BASH_REMATCH[1]}\" />"
            elif [[ $line =~ $resource_re ]]; then
                limit="4GiB"
                if [ "${BASH_REMATCH[1]}" = "disk" ]; then
                    limit="8GiB"
                fi
                new_line="  <policy domain=\"resource\" name=\"${BASH_REMATCH[1]}\" value=\"${limit}\"/>"
            fi
        fi
        # Only a real textual change counts, so repeated runs are no-ops.
        if [ "$new_line" != "$line" ]; then
            needs_update=1
        fi
        printf '%s\n' "$new_line"
    done < "$policy_file" > "$temp_policy"

    if [ "$needs_update" -eq 1 ]; then
        log_message "INFO" "Updating ImageMagick policy with GPU-friendly limits..."
        # Overwrite the content in place: this keeps the file's owner and mode
        # and needs only the file write permission that was checked above.
        if ! cat "$temp_policy" > "$policy_file"; then
            log_message "ERROR" "Failed to write updated policy file"
            rm -f "$temp_policy"
            return 1
        fi
        log_message "INFO" "ImageMagick policy updated successfully"
    else
        log_message "INFO" "ImageMagick policy is already correctly configured"
    fi
    rm -f "$temp_policy"

    return 0
}

# Function to log messages with timestamps
log_message() {
    # Emit one line on stdout: "[YYYY-MM-DD HH:MM:SS] [LEVEL] message".
    #   $1 - severity label (e.g. INFO, WARNING, ERROR)
    #   $2 - message text
    local severity=$1
    local text=$2
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] [%s] %s\n' "$stamp" "$severity" "$text"
}

# Function to validate output directory
validate_output_dir() {
    # Make sure the output directory exists, is writable and has enough room.
    #
    # Arguments: $1 - output directory (defaults to ".")
    #            $2 - optional minimum free space in MB (defaults to 2048)
    # Returns:   0 when usable, 1 otherwise.
    #
    # The directory itself is checked (and created when missing), not its
    # parent: results are written directly into it.
    local output_dir="${1:-.}"
    local min_free_mb="${2:-2048}"

    if [[ ! $min_free_mb =~ ^[0-9]+$ ]]; then
        min_free_mb=2048
    fi

    if [ ! -d "$output_dir" ]; then
        log_message "INFO" "Output directory '$output_dir' does not exist. Creating..."
        if ! mkdir -p -- "$output_dir"; then
            log_message "ERROR" "Failed to create output directory"
            return 1
        fi
    fi

    if [ ! -w "$output_dir" ]; then
        log_message "ERROR" "Output directory '$output_dir' is not writable"
        return 1
    fi

    # -P gives one unwrapped line per filesystem, -k fixes the unit to KiB;
    # both are POSIX, unlike GNU's -BM.
    local available_kb
    available_kb=$(df -Pk "$output_dir" 2>/dev/null | awk 'NR==2 {print $4}')
    if [[ ! $available_kb =~ ^[0-9]+$ ]]; then
        # Not being able to measure is not proof of a full disk; carry on.
        log_message "WARNING" "Could not determine free disk space for '$output_dir'; skipping space check"
        return 0
    fi

    if [ $((available_kb / 1024)) -lt "$min_free_mb" ]; then
        log_message "ERROR" "Insufficient disk space in output directory (less than ${min_free_mb}MB available)"
        return 1
    fi

    return 0
}

# Function to process a single file (PDF or image)
# Returns: 0 = success, 1 = failed, 2 = skipped (no text)
process_file() {
    # OCR one input file (PDF or image) into a text file.
    #
    # Arguments: $1 - input file (".pdf" is rendered page by page, anything
    #                 else is treated as a single image)
    #            $2 - output text file
    #            $3 - private scratch directory for intermediate files
    # Returns:   0 = success, 1 = failed, 2 = skipped (no meaningful text).
    #            On 1 and 2 no output file is left behind.
    local input_file="$1"
    local output_text="$2"
    local temp_dir="$3"
    local file_ext pdf_info num_pages page pages_ok
    local base_image image_file enhanced_file
    local -a page_images
    # Same tesseract settings for PDF pages and plain images.
    local -a ocr_opts=(-l eng --oem 1 --psm 6 --dpi 300 txt)

    # An empty scratch path would turn the cleanup below into "rm -rf /images".
    if [ -z "$temp_dir" ]; then
        log_message "ERROR" "No temp directory given for: $input_file"
        return 1
    fi

    # Start from empty scratch directories
    rm -rf "${temp_dir}/"{images,enhanced,processed}
    if ! mkdir -p "${temp_dir}/"{images,enhanced,processed}; then
        log_message "ERROR" "Cannot create temp directories under: $temp_dir"
        return 1
    fi

    file_ext="${input_file##*.}"
    file_ext="${file_ext,,}"

    # Initialize text output
    if ! printf '' > "$output_text"; then
        log_message "ERROR" "Cannot write output file: $output_text"
        return 1
    fi

    if [[ "$file_ext" == "pdf" ]]; then
        # Process PDF
        if ! pdf_info=$(pdfinfo "$input_file" 2>/dev/null); then
            log_message "ERROR" "Invalid PDF file: $input_file"
            rm -f "$output_text"
            return 1
        fi

        # Anchor on the "Pages:" field; a bare "Pages" also matches metadata
        # such as a document title.
        num_pages=$(awk '/^Pages:/ {print $2; exit}' <<< "$pdf_info")
        if [[ ! $num_pages =~ ^[0-9]+$ ]]; then
            log_message "ERROR" "Invalid PDF file: $input_file"
            rm -f "$output_text"
            return 1
        fi

        pages_ok=0
        for (( page = 1; page <= num_pages; page++ )); do
            base_image="${temp_dir}/images/page_${page}"
            if ! pdftoppm -f "$page" -l "$page" -png -r 300 "$input_file" "$base_image" 2>/dev/null; then
                continue
            fi

            # pdftoppm appends "-<page number>" (possibly zero padded); take
            # the first match. An unmatched glob stays literal and fails -f.
            page_images=("${base_image}"-*.png)
            image_file="${page_images[0]}"
            if [ ! -f "$image_file" ]; then
                continue
            fi

            # Enhance and OCR
            enhanced_file="${temp_dir}/enhanced/page_${page}.png"
            if ! enhance_image "$image_file" "$enhanced_file"; then
                continue
            fi

            if ! tesseract "$enhanced_file" \
                "${temp_dir}/processed/page_${page}" \
                "${ocr_opts[@]}" 2>/dev/null; then
                continue
            fi

            pages_ok=$((pages_ok + 1))
            if [ -f "${temp_dir}/processed/page_${page}.txt" ]; then
                cat "${temp_dir}/processed/page_${page}.txt" >> "$output_text"
            fi
        done

        # Every page failing is an error, not a "photograph without text".
        if [ "$num_pages" -gt 0 ] && [ "$pages_ok" -eq 0 ]; then
            log_message "ERROR" "OCR failed for: $input_file"
            rm -f "$output_text"
            return 1
        fi
    else
        # Process image file (TIF, JPG, PNG)
        enhanced_file="${temp_dir}/enhanced/image.png"
        if ! enhance_image "$input_file" "$enhanced_file"; then
            log_message "ERROR" "Failed to enhance image: $input_file"
            rm -f "$output_text"
            return 1
        fi

        if ! tesseract "$enhanced_file" \
            "${temp_dir}/processed/output" \
            "${ocr_opts[@]}" 2>/dev/null; then
            log_message "ERROR" "OCR failed for: $input_file"
            rm -f "$output_text"
            return 1
        fi

        if [ -f "${temp_dir}/processed/output.txt" ]; then
            cat "${temp_dir}/processed/output.txt" >> "$output_text"
        fi
    fi

    # Check if output contains meaningful text
    if ! has_text_content "$output_text" 3; then
        # No meaningful text found - likely a photograph
        rm -f "$output_text"
        return 2
    fi

    return 0
}

# Function to enhance image for OCR
enhance_image() {
    # Pre-process an image for OCR with ImageMagick's convert.
    #   $1 - source image
    #   $2 - destination image
    # Reads the global USE_GPU: 1 picks the larger limits and stronger
    # filters, anything else the lighter profile.
    # Returns 0 when convert succeeds, 1 otherwise.
    local input_image="$1"
    local output_image="$2"
    local -a limits
    local -a tuning

    if [ "$USE_GPU" -eq 1 ]; then
        limits=(-limit memory 4GiB -limit map 4GiB)
        tuning=(-resize 150% -unsharp 1.5x1+0.7+0.02 -threshold 60% -noise 1)
    else
        limits=(-limit memory 2GiB -limit map 2GiB)
        tuning=(-resize 120% -unsharp 1x1+0.5+0.01 -threshold 50%)
    fi

    # Both profiles end with the same normalisation steps.
    convert "${limits[@]}" "$input_image" \
        "${tuning[@]}" \
        -normalize \
        -density 300 \
        -depth 8 \
        -strip \
        -background white \
        -alpha remove \
        -colorspace gray \
        "$output_image" 2>/dev/null || return 1

    return 0
}

# Function to detect CPU cores
detect_cores() {
    # Print how many worker processes to run: detected CPU count minus two,
    # never less than one.
    # Detection order: /proc/cpuinfo, nproc, sysctl hw.ncpu, then a fixed 4.
    local detected=4

    if [ -f /proc/cpuinfo ]; then
        detected=$(grep -c ^processor /proc/cpuinfo)
    elif command -v nproc >/dev/null 2>&1; then
        detected=$(nproc)
    elif command -v sysctl >/dev/null 2>&1; then
        detected=$(sysctl -n hw.ncpu 2>/dev/null || echo 4)
    fi

    # Leave two cores for the rest of the system.
    local usable=$((detected - 2))
    if (( usable < 1 )); then
        usable=1
    fi

    echo "$usable"
}

# Function to check dependencies and GPU support
check_dependencies() {
    # Verify the required tools are installed and decide whether the GPU
    # image-enhancement profile can be used.
    #
    # Globals:  USE_GPU is set to 1 only when nvidia-smi is available and
    #           "convert -version" mentions OpenCL; otherwise 0.
    # Returns:  0 when every required tool is present, 1 otherwise.
    local dep
    local missing_deps=()
    local deps=(
        "tesseract"
        "convert"
        "pdftoppm"
        "pdfinfo"
        "bc"
        "file"
    )

    for dep in "${deps[@]}"; do
        if ! command -v "$dep" >/dev/null 2>&1; then
            missing_deps+=("$dep")
        fi
    done

    if [ ${#missing_deps[@]} -ne 0 ]; then
        log_message "ERROR" "Missing dependencies: ${missing_deps[*]}"
        log_message "INFO" "Please install the required dependencies:"
        log_message "INFO" "sudo apt-get install tesseract-ocr imagemagick poppler-utils bc file"
        return 1
    fi

    # Parse the major version instead of matching "4" or "5" literally so
    # later releases are not flagged. stderr is captured too because old
    # releases print their version there.
    local version_output
    local tesseract_major=0
    local version_re='tesseract[[:space:]]+v?([0-9]+)'
    version_output=$(tesseract --version 2>&1)
    if [[ $version_output =~ $version_re ]]; then
        tesseract_major=${BASH_REMATCH[1]}
    fi
    if [ "$tesseract_major" -lt 4 ]; then
        log_message "WARNING" "Tesseract version < 4.0 detected. Handwritten OCR works best with 4.0+ (LSTM engine)."
    fi

    if command -v nvidia-smi >/dev/null 2>&1; then
        log_message "INFO" "NVIDIA GPU detected: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)"
        if convert -version | grep -q "OpenCL"; then
            log_message "INFO" "ImageMagick compiled with OpenCL support - GPU acceleration enabled"
            USE_GPU=1
        else
            log_message "WARNING" "ImageMagick lacks OpenCL support. Install a GPU-enabled version for better performance."
            USE_GPU=0
        fi
    else
        log_message "WARNING" "No NVIDIA GPU detected. Running on CPU only."
        USE_GPU=0
    fi

    return 0
}

# Function to update progress counter (thread-safe)
increment_progress() {
    # Add one to the counter stored in the progress file and print the new
    # value.
    #   $1 - progress file
    # A "<file>.lock" directory serves as the lock: mkdir fails while it
    # exists, so the loop retries until it can be created.
    local progress_file="$1"
    local mutex_dir="${progress_file}.lock"
    local tally=0

    # Take the lock
    until mkdir "$mutex_dir" 2>/dev/null; do
        sleep 0.01
    done

    # Read, bump and store the counter
    if [ -f "$progress_file" ]; then
        tally=$(cat "$progress_file")
    fi
    tally=$((tally + 1))
    echo "$tally" > "$progress_file"

    # Drop the lock
    rmdir "$mutex_dir"

    echo "$tally"
}

# Function to display progress bar with time estimation
show_progress_monitor() {
    # Redraw a one-line progress bar once per second until the counter in the
    # progress file reaches the total, then print a final "Complete!" line.
    #
    # Arguments: $1 - total number of files
    #            $2 - start time (seconds since the epoch)
    #            $3 - progress file holding the number of finished files
    local total=$1
    local start_time=$2
    local progress_file="$3"
    local width=50
    local current=0
    local reading percentage filled empty now elapsed remaining avg_tenths
    local elapsed_str remaining_str bar_done bar_todo

    while true; do
        reading=""
        if [ -f "$progress_file" ]; then
            reading=$(cat "$progress_file" 2>/dev/null)
        fi
        # The file can be read while a worker is rewriting it (empty or
        # partial content); keep the last good value in that case.
        if [[ $reading =~ ^[0-9]+$ ]]; then
            current=$((10#$reading))
        fi

        if [ "$current" -ge "$total" ]; then
            break
        fi

        percentage=$((current * 100 / total))
        filled=$((width * current / total))
        empty=$((width - filled))

        now=$(date +%s)
        elapsed=$((now - start_time))
        if (( elapsed < 0 )); then
            elapsed=0
        fi

        # Integer arithmetic only: the average is kept in tenths of a second.
        avg_tenths=0
        remaining=0
        if [ "$current" -gt 0 ]; then
            avg_tenths=$(( (elapsed * 10 + current / 2) / current ))
            remaining=$(( elapsed * (total - current) / current ))
        fi

        # HH:MM:SS without GNU "date -d"; hours keep counting past 24.
        printf -v elapsed_str '%02d:%02d:%02d' \
            $((elapsed / 3600)) $((elapsed % 3600 / 60)) $((elapsed % 60))
        printf -v remaining_str '%02d:%02d:%02d' \
            $((remaining / 3600)) $((remaining % 3600 / 60)) $((remaining % 60))

        printf -v bar_done '%*s' "$filled" ''
        printf -v bar_todo '%*s' "$empty" ''

        printf '\rProgress: [%s%s] %3d%% | Files: %d/%d | Elapsed: %s | Remaining: %s | Avg: %d.%ds/file   ' \
               "${bar_done// /#}" "${bar_todo// /-}" \
               "$percentage" "$current" "$total" \
               "$elapsed_str" "$remaining_str" \
               $((avg_tenths / 10)) $((avg_tenths % 10))

        sleep 1
    done

    # Final update
    printf -v bar_done '%*s' "$width" ''
    printf '\rProgress: [%s' "${bar_done// /#}"
    printf "] 100%% | Files: %d/%d | Complete!                                            \n" \
           "$total" "$total"
}

# Function to convert file path to output filename
path_to_filename() {
    # Map an input path to its output text file and print it.
    #   $1 - input path (a leading "./" is ignored)
    #   $2 - output directory
    # Directory separators become "_" and the last ".ext" is dropped, so
    # "./a/b/c.pdf" in "out" yields "out/a_b_c.txt".
    local filepath="$1"
    local output_dir="$2"
    local flattened stem

    flattened="${filepath#./}"
    flattened="${flattened//\//_}"
    stem="${flattened%.*}"

    printf '%s\n' "${output_dir}/${stem}.txt"
}

# Function to validate file (PDF or image)
validate_file() {
    # Succeed (0) only for an existing, readable regular file; otherwise log
    # an error and return 1.
    #   $1 - path to check
    local file="$1"

    if [[ -f "$file" && -r "$file" ]]; then
        return 0
    fi

    log_message "ERROR" "File '$file' does not exist or is not readable"
    return 1
}

# Function to check if text contains meaningful words
has_text_content() {
    # Decide whether a text file holds enough word-like tokens.
    #   $1 - text file
    #   $2 - minimum number of tokens (default 3)
    # A token is a run of at least two alphanumeric characters.
    # Returns 0 when the minimum is reached, 1 otherwise (including a
    # missing or empty file).
    local text_file="$1"
    local min_words="${2:-3}"
    local tokens

    # Nothing to count in a missing or zero-length file.
    if [ ! -f "$text_file" ] || [ ! -s "$text_file" ]; then
        return 1
    fi

    tokens=$(grep -oE '[[:alnum:]]{2,}' "$text_file" 2>/dev/null | wc -l)

    if [ "$tokens" -ge "$min_words" ]; then
        return 0
    fi
    return 1
}

# Worker function for parallel processing
process_file_worker() {
    # Handle one input file on behalf of the parallel runner: validate it,
    # OCR it, record the outcome and bump the shared progress counter.
    #   $1 - input file            $5 - tally file for failures
    #   $2 - output directory      $6 - tally file for skipped files
    #   $3 - progress file         $7 - base directory for scratch space
    #   $4 - tally file for successes
    # Each outcome appends one "1" line to the matching tally file.
    # Returns 1 when the input fails validation, 0 otherwise.
    local input_file="$1"
    local output_dir="$2"
    local progress_file="$3"
    local success_file="$4"
    local failed_file="$5"
    local skipped_file="$6"
    local temp_base_dir="$7"
    local output_text tally_file

    # Private scratch area, named after this process id plus a random number
    local scratch="${temp_base_dir}/worker_$$_${RANDOM}"
    mkdir -p "${scratch}/images" "${scratch}/enhanced" "${scratch}/processed"

    # Unreadable or missing input counts as a failure
    if ! validate_file "$input_file"; then
        echo "1" >> "$failed_file"
        increment_progress "$progress_file" >/dev/null
        rm -rf "$scratch"
        return 1
    fi

    # Where the extracted text goes
    output_text=$(path_to_filename "$input_file" "$output_dir")

    # Run the OCR and map its status to a tally file:
    # 0 = text found, 2 = no text (photograph), anything else = error
    process_file "$input_file" "$output_text" "$scratch"
    case $? in
        0) tally_file="$success_file" ;;
        2) tally_file="$skipped_file" ;;
        *) tally_file="$failed_file" ;;
    esac
    echo "1" >> "$tally_file"

    # Report completion, then discard the scratch area
    increment_progress "$progress_file" >/dev/null
    rm -rf "$scratch"

    return 0
}

# Parse arguments
OUTPUT_DIR="${1:-.}"

# Check dependencies
log_message "INFO" "Checking dependencies..."
if ! check_dependencies; then
    exit 1
fi

# Detect CPU cores
CORES=$(detect_cores)
log_message "INFO" "Using $CORES CPU core(s) for parallel processing"

# Export functions and variables for parallel execution: every worker runs in
# its own "bash -c" and only sees what is exported here.
export -f process_file_worker
export -f process_file
export -f enhance_image
export -f validate_file
export -f path_to_filename
export -f increment_progress
export -f log_message
export -f has_text_content
export USE_GPU

# Check and update ImageMagick policy (suppress errors if not running as root)
log_message "INFO" "Checking ImageMagick policy..."
check_imagemagick_policy 2>/dev/null || log_message "WARNING" "Could not update ImageMagick policy. You may need to run with sudo if processing fails."

# Validate output directory
log_message "INFO" "Validating output directory..."
if ! validate_output_dir "$OUTPUT_DIR"; then
    exit 1
fi

# Create output directory if it doesn't exist
if ! mkdir -p -- "$OUTPUT_DIR"; then
    log_message "ERROR" "Failed to create output directory '$OUTPUT_DIR'"
    exit 1
fi

# Find all files recursively. Paths travel NUL-delimited so names containing
# quotes, backslashes or newlines stay intact. Scratch directories left over
# from an earlier, killed run are not scanned.
log_message "INFO" "Searching for files recursively..."
FILES=()
while IFS= read -r -d '' found_file; do
    FILES+=("$found_file")
done < <(find . -type d -path './ocr_temp_*' -prune -o \
    -type f \( -iname "*.pdf" -o -iname "*.tif" -o -iname "*.tiff" -o -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" \) \
    -print0 | sort -z)

total_files=${#FILES[@]}

if [ "$total_files" -eq 0 ]; then
    log_message "ERROR" "No PDF, TIF, JPG, or PNG files found in current directory or subdirectories"
    exit 1
fi

log_message "INFO" "Found $total_files file(s) to process"

# Create temp directory (unique, not guessable) and make sure it is removed,
# and the progress monitor stopped, on every exit path.
PROGRESS_PID=""
if ! TEMP_DIR=$(mktemp -d "./ocr_temp_$(date +%Y%m%d_%H%M%S)_XXXXXX"); then
    log_message "ERROR" "Failed to create temporary directory"
    exit 1
fi

cleanup() {
    if [ -n "$PROGRESS_PID" ]; then
        kill "$PROGRESS_PID" 2>/dev/null
    fi
    rm -rf -- "${TEMP_DIR}"
}
trap cleanup EXIT

# Create progress tracking files
PROGRESS_FILE="${TEMP_DIR}/progress.txt"
SUCCESS_FILE="${TEMP_DIR}/success.txt"
FAILED_FILE="${TEMP_DIR}/failed.txt"
SKIPPED_FILE="${TEMP_DIR}/skipped.txt"
echo "0" > "$PROGRESS_FILE"
: > "$SUCCESS_FILE"
: > "$FAILED_FILE"
: > "$SKIPPED_FILE"

# Start progress monitor in background
start_time=$(date +%s)
show_progress_monitor "$total_files" "$start_time" "$PROGRESS_FILE" &
PROGRESS_PID=$!

# Process all files in parallel
log_message "INFO" "Starting parallel OCR processing..."

# Use xargs for parallel execution. The file name is handed to the worker as
# a positional argument ($7, appended by xargs after the six fixed ones) and
# is never pasted into the shell code itself, so no file name can break or
# inject commands.
printf '%s\0' "${FILES[@]}" |
    xargs -0 -n 1 -P "$CORES" bash -c \
        'process_file_worker "$7" "$1" "$2" "$3" "$4" "$5" "$6"' _ \
        "$OUTPUT_DIR" "$PROGRESS_FILE" "$SUCCESS_FILE" "$FAILED_FILE" "$SKIPPED_FILE" "$TEMP_DIR" || true

# All workers are done. Force the counter to the total so the monitor ends
# even if a worker died before reporting, then wait for its final line.
echo "$total_files" > "$PROGRESS_FILE"
wait "$PROGRESS_PID" 2>/dev/null || true
PROGRESS_PID=""

# Calculate results
successful_files=$(wc -l < "$SUCCESS_FILE" 2>/dev/null || echo 0)
failed_files=$(wc -l < "$FAILED_FILE" 2>/dev/null || echo 0)
skipped_files=$(wc -l < "$SKIPPED_FILE" 2>/dev/null || echo 0)

end_time=$(date +%s)
total_time=$((end_time - start_time))
printf -v total_time_str '%02d:%02d:%02d' \
    $((total_time / 3600)) $((total_time % 3600 / 60)) $((total_time % 60))
avg_time_per_file=$(bc <<< "scale=2; $total_time / $total_files" 2>/dev/null || echo "0")

log_message "INFO" "Processing complete:"
log_message "INFO" "- Successfully processed files: $successful_files/$total_files"
log_message "INFO" "- Skipped files (no text/photographs): $skipped_files"
log_message "INFO" "- Failed files: $failed_files"
log_message "INFO" "- Total processing time: $total_time_str"
log_message "INFO" "- Average time per file: ${avg_time_per_file}s"
log_message "INFO" "- Cores used: $CORES"
log_message "INFO" "- Output directory: $OUTPUT_DIR"

if [ "$successful_files" -eq 0 ]; then
    log_message "WARNING" "No files with text content were found"
    if [ "$skipped_files" -gt 0 ]; then
        log_message "INFO" "$skipped_files files were skipped (likely photographs with no text)"
    fi
    exit 1
fi
# Download SH
#
# ← Back to SH