Csv Match

Shell Script Uploaded Feb 16, 2026 by pagetelegram 0 views 25.09 KB
New flags: - -m (match) — output only rows where two columns have the same value - -n (no-match) — output only rows where two columns have different values Command-line usage: ./csv_match.sh -m 3 5 data.csv # rows where column 3 == column 5 ./csv_match.sh -n 3 5 data.csv # rows where column 3 != column 5 Interactive mode now presents a mode selection menu: 1. Split — original behavior (split by unique values) 2. Match — keep rows where two columns are equal 3. No-match — keep rows where two columns differ Output files are named descriptively, e.g. data_match_State_eq_Region.csv or data_nomatch_State_ne_Region.csv, and the script reports how many rows were kept vs skipped. New features Progress bar (draw_progress helper): - Both split_csv and filter_csv now display a live progress bar on stderr - Updates every 1,000 rows: [████████████████░░░░░░░░░░░░░░] 53% (530000/1000000 rows) - Row counts are shown in the status header for context Multi-core parallel processing (-j N flag): - Command line: -j 4 to use 4 cores, e.g. 
./csv_match.sh -j 4 -m 3 5 data.csv - Interactive mode: prompts CPU cores to use (1-N) [1]: after mode selection, auto-detecting available cores via nproc - Default is 1 (single-core, preserving original behavior) How parallelism works - The input file body (minus header) is split into N chunks via split -l - Each chunk is processed by its own background awk worker - Workers report progress to individual files; a background monitor polls them every 0.2s and draws the combined progress bar - Results are merged in chunk order to preserve row ordering - For split_csv: each worker writes per-value files into its own temp directory, then results are merged by value across all workers - For filter_csv: each worker produces a filtered chunk, then all chunks are concatenated Safety - cleanup trap on EXIT kills any orphaned monitor process and removes temp directories - Empty file edge cases (0 data rows) are handled gracefully - -j value is validated as a positive integer This latest version avoids using tmp partition as that can get exhausted causing premature exits.
research csv parallel
SH csv_match.sh
#!/usr/bin/env bash
#
# csv_match.sh — Split a CSV file into separate files based on unique values
#                in a specified column. Each output file contains rows sharing
#                the same value in that column, with column 2 as the values column.
#
# Output filenames:  <column_header>_<value>.csv
#
# Usage (command-line):
#   ./csv_match.sh <input.csv> <column_number>
#   ./csv_match.sh -d <input.csv> <column_number>   # each file in its own subfolder
#   ./csv_match.sh -m <col_a> <col_b> <input.csv>   # only rows where col_a == col_b
#   ./csv_match.sh -n <col_a> <col_b> <input.csv>   # only rows where col_a != col_b
#   ./csv_match.sh -j 4 -m <col_a> <col_b> <input.csv>   # use 4 CPU cores
#
# Usage (interactive):
#   ./csv_match.sh
#
# Flags:
#   -d    Place each output CSV in its own subfolder named after the file prefix
#   -m    Match mode: output only rows where two columns have the same value
#   -n    No-match mode: output only rows where two columns differ
#   -j N  Use N CPU cores for parallel processing (default: 1)
#

set -euo pipefail

# ── cleanup ──────────────────────────────────────────────────────────────────

# Shared state for the EXIT trap: a temp directory created during parallel
# runs and the PID of the background progress monitor.
_PARALLEL_TMP=""
_MONITOR_PID=""

# Reap the progress monitor (if running) and delete the parallel temp
# directory.  Registered on EXIT so interrupted runs leave no orphans.
cleanup() {
    local mon="$_MONITOR_PID"
    if [[ -n "$mon" ]]; then
        kill "$mon" 2>/dev/null || true
        wait "$mon" 2>/dev/null || true
        _MONITOR_PID=""
    fi
    if [[ -n "$_PARALLEL_TMP" ]] && [[ -d "$_PARALLEL_TMP" ]]; then
        rm -rf -- "$_PARALLEL_TMP"
        _PARALLEL_TMP=""
    fi
}
trap cleanup EXIT

# ── helpers ──────────────────────────────────────────────────────────────────

die() { echo "ERROR: $*" >&2; exit 1; }

# Best-effort CPU count: nproc (Linux), then sysctl (macOS/BSD), else 1.
detect_cores() {
    local n
    n=$(nproc 2>/dev/null) || n=$(sysctl -n hw.ncpu 2>/dev/null) || n=1
    printf '%s\n' "$n"
}

# Render a 40-cell progress bar on stderr, rewriting the current line
# (leading \r, no trailing newline).  A zero total draws nothing; a current
# value beyond total is clamped to 100%.
# Usage: draw_progress <current> <total>
draw_progress() {
    local cur=$1 total=$2
    local width=40 cell
    if (( total == 0 )); then
        return
    fi
    if (( cur > total )); then
        cur=$total
    fi
    local pct=$(( cur * 100 / total ))
    local n_fill=$(( cur * width / total ))
    local bar=""
    for (( cell = 0; cell < width; cell++ )); do
        if (( cell < n_fill )); then
            bar+="█"
        else
            bar+="░"
        fi
    done
    printf '\r  [%s] %3d%% (%d/%d rows)' "$bar" "$pct" "$cur" "$total" >&2
}

# Poll the per-worker progress files every 0.2s, sum them, and redraw the
# combined progress bar.  Loops forever: the caller backgrounds this function
# and kills it once all workers have finished.
# Usage: progress_monitor <tmp_dir> <total_rows> <num_workers>
progress_monitor() {
    local dir="$1" total="$2" workers="$3"
    local w done_rows count
    while :; do
        done_rows=0
        for (( w = 0; w < workers; w++ )); do
            count=$(cat "$dir/progress_$w" 2>/dev/null) || count=0
            # Treat missing / partially-written files as zero progress.
            if ! [[ "$count" =~ ^[0-9]+$ ]]; then
                count=0
            fi
            done_rows=$(( done_rows + count ))
        done
        draw_progress "$done_rows" "$total"
        sleep 0.2
    done
}

# Display the CSV's header row as a numbered column menu (1-based, matching
# the column numbers the script accepts elsewhere).
# Fixes vs. previous version: strips a trailing CR so CRLF files don't garble
# the last column name; tolerates a header line without a trailing newline
# (read returns non-zero there, which would kill the script under set -e);
# keeps its loop variable and array local instead of leaking globals.
# Usage: show_columns <file>
show_columns() {
    local file="$1"
    local -a headers
    local i
    echo ""
    echo "Available columns in '$file':"
    echo "─────────────────────────────────────────"
    # Read header and print each column with its number.  read exits non-zero
    # when the line has no trailing newline even though it fills the array,
    # so don't let that abort the script.
    IFS=',' read -r -a headers < "$file" || true
    for i in "${!headers[@]}"; do
        # Strip a trailing carriage return (CRLF input) before display.
        printf "  %3d : %s\n" "$((i + 1))" "${headers[$i]%$'\r'}"
    done
    echo "─────────────────────────────────────────"
    echo ""
}

# ── core functions ───────────────────────────────────────────────────────────

# Split <input_file> into one CSV per unique value in column <split_col>.
# Output files are named "<col_name>_<safe_value>.csv" under "<input>_split/"
# (each in its own subfolder when use_subdirs == "yes").  With num_cores <= 1
# a single awk streams the file and draws an inline progress bar; otherwise
# the body is chunked via split(1), processed by parallel awk workers, and
# merged per value in chunk order so the original row order is preserved.
# Arguments: $1 input CSV, $2 1-based split column, $3 "yes"/"no" subdirs,
#            $4 number of worker processes.
split_csv() {
    local input_file="$1"
    local split_col="$2"        # 1-based column number to split on
    local use_subdirs="$3"       # "yes" to place each file in its own subfolder
    local num_cores="$4"
    local values_col=2           # column 2 holds the values

    [[ -f "$input_file" ]] || die "File not found: $input_file"

    # Read the header
    local header
    header=$(head -n 1 "$input_file")
    IFS=',' read -r -a header_arr <<< "$header"

    local num_cols=${#header_arr[@]}

    # Validate column number
    if (( split_col < 1 || split_col > num_cols )); then
        die "Column $split_col is out of range (1–$num_cols)."
    fi

    local col_name="${header_arr[$((split_col - 1))]}"
    # Strip any trailing carriage return from the column name
    col_name="${col_name%$'\r'}"

    local val_col_name="${header_arr[$((values_col - 1))]}"
    val_col_name="${val_col_name%$'\r'}"

    # Create output directory
    local out_dir="${input_file%.csv}_split"
    mkdir -p "$out_dir"

    # Data-row count (file lines minus the header).
    local total_lines
    total_lines=$(( $(wc -l < "$input_file") - 1 ))

    echo ""
    echo "Splitting '$input_file' ($total_lines rows)"
    echo "  Split column : $split_col ($col_name)"
    echo "  Values column: $values_col ($val_col_name)"
    echo "  Output dir   : $out_dir/"
    echo "  CPU cores    : $num_cores"
    [[ "$use_subdirs" == "yes" ]] && echo "  Subfolders   : enabled (-d)"
    echo ""

    # Header-only file: nothing to split.
    if (( total_lines == 0 )); then
        echo "  No data rows to process."
        echo "Done."
        return
    fi

    if (( num_cores <= 1 )); then
        # ── Single-core with progress bar ──────────────────────────────
        awk -F',' -v col="$split_col" \
                  -v vcol="$values_col" \
                  -v col_name="$col_name" \
                  -v val_col_name="$val_col_name" \
                  -v out_dir="$out_dir" \
                  -v use_subdirs="$use_subdirs" \
                  -v total="$total_lines" \
        'BEGIN { OFS="," }
        NR == 1 {
            full_header = $0
            mini_header = col_name "," val_col_name
            next
        }
        {
            val = $col
            gsub(/\r/, "", val)
            safe_val = val
            gsub(/[^A-Za-z0-9._-]/, "_", safe_val)

            prefix = col_name "_" safe_val

            if (use_subdirs == "yes") {
                subdir = out_dir "/" prefix
                outfile = subdir "/" prefix ".csv"
            } else {
                outfile = out_dir "/" prefix ".csv"
            }

            if (!(val in seen)) {
                seen[val] = 1
                if (use_subdirs == "yes") {
                    system("mkdir -p \"" subdir "\"")
                }
                print full_header > outfile
                count++
            }

            print $0 >> outfile

            # Progress bar
            row = NR - 1
            if (row % 1000 == 0 || row == total) {
                pct = int(row * 100 / total)
                filled = int(row * 40 / total)
                bar = ""
                for (j = 0; j < filled; j++) bar = bar "\xe2\x96\x88"
                for (j = filled; j < 40; j++) bar = bar "\xe2\x96\x91"
                printf "\r  [%s] %3d%% (%d/%d rows)", bar, pct, row, total > "/dev/stderr"
                fflush("/dev/stderr")
            }
        }
        END {
            printf "\n" > "/dev/stderr"
            printf "  Created %d files for %d unique values in column \"%s\".\n", count, count, col_name
        }' "$input_file"
    else
        # ── Multi-core with progress ───────────────────────────────────
        # Temp dir lives beside the output dir rather than /tmp so a large
        # input cannot exhaust a small tmpfs partition.
        local tmp_dir
        tmp_dir="${out_dir}/.tmp_parallel_$$"
        mkdir -p "$tmp_dir"
        _PARALLEL_TMP="$tmp_dir"

        # Split body into chunks
        local chunk_size=$(( (total_lines + num_cores - 1) / num_cores ))
        tail -n +2 "$input_file" | split -l "$chunk_size" -a 4 - "$tmp_dir/chunk_"

        local chunks=( "$tmp_dir"/chunk_* )
        local num_chunks=${#chunks[@]}

        # Initialize progress files
        local i
        for ((i = 0; i < num_chunks; i++)); do
            echo 0 > "$tmp_dir/progress_$i"
        done

        # Start progress monitor
        progress_monitor "$tmp_dir" "$total_lines" "$num_chunks" &
        _MONITOR_PID=$!

        # Process each chunk in parallel.  Each worker writes headerless
        # per-value files into its private worker_<i> directory and reports
        # its row count via progress_<i> every 1000 rows.
        local pids=()
        for i in "${!chunks[@]}"; do
            local worker_dir="$tmp_dir/worker_$i"
            mkdir -p "$worker_dir"
            (
                awk -F',' -v col="$split_col" \
                          -v col_name="$col_name" \
                          -v worker_dir="$worker_dir" \
                          -v pfile="$tmp_dir/progress_$i" \
                'BEGIN { OFS="," }
                {
                    val = $col
                    gsub(/\r/, "", val)
                    safe_val = val
                    gsub(/[^A-Za-z0-9._-]/, "_", safe_val)
                    prefix = col_name "_" safe_val
                    outfile = worker_dir "/" prefix ".csv"

                    print $0 >> outfile

                    if (NR % 1000 == 0) {
                        print NR > pfile
                        close(pfile)
                    }
                }
                END {
                    print NR > pfile
                    close(pfile)
                }' "${chunks[$i]}"
            ) &
            pids+=($!)
        done

        # Wait for all workers
        for pid in "${pids[@]}"; do
            wait "$pid"
        done

        # Stop monitor and show 100%
        kill "$_MONITOR_PID" 2>/dev/null || true
        wait "$_MONITOR_PID" 2>/dev/null || true
        _MONITOR_PID=""
        draw_progress "$total_lines" "$total_lines"
        echo "" >&2

        # Merge results from all workers
        # Collect unique output filenames across all workers
        declare -A all_prefixes
        for ((i = 0; i < num_chunks; i++)); do
            local worker_dir="$tmp_dir/worker_$i"
            if [[ -d "$worker_dir" ]]; then
                for f in "$worker_dir"/*.csv; do
                    [[ -f "$f" ]] || continue
                    local bname
                    bname=$(basename "$f")
                    all_prefixes["$bname"]=1
                done
            fi
        done

        # For each unique value: emit the header once, then append each
        # worker's rows in chunk order to preserve the original ordering.
        local count=0
        for prefix_file in "${!all_prefixes[@]}"; do
            local final_prefix="${prefix_file%.csv}"
            local final_file
            if [[ "$use_subdirs" == "yes" ]]; then
                mkdir -p "$out_dir/$final_prefix"
                final_file="$out_dir/$final_prefix/$prefix_file"
            else
                final_file="$out_dir/$prefix_file"
            fi

            # Write header once
            echo "$header" > "$final_file"

            # Append data from each worker
            for ((i = 0; i < num_chunks; i++)); do
                local worker_file="$tmp_dir/worker_$i/$prefix_file"
                if [[ -f "$worker_file" ]]; then
                    cat "$worker_file" >> "$final_file"
                fi
            done

            count=$((count + 1))
        done

        echo "  Created $count files for $count unique values in column \"$col_name\"."

        rm -rf "$tmp_dir"
        _PARALLEL_TMP=""
    fi

    # Summarize every produced file with its data-row count.
    echo ""
    echo "Output files:"
    find "$out_dir" -name '*.csv' -type f | sort | while read -r f; do
        rows=$(( $(wc -l < "$f") - 1 ))
        # Show path relative to out_dir
        rel="${f#"$out_dir"/}"
        printf "  %-60s  (%d rows)\n" "$rel" "$rows"
    done
    echo ""
    echo "Done."
}

# Keep only rows where column <col_a> equals (mode "match") or differs from
# (mode "nomatch") column <col_b>.  The header plus surviving rows go to a
# descriptively named sibling file ("<base>_match_<A>_eq_<B>.csv" or
# "<base>_nomatch_<A>_ne_<B>.csv"); kept/skipped counts print to stderr.
# With num_cores > 1 the body is chunked, filtered by parallel awk workers,
# and the results are concatenated in chunk order.
# Arguments: $1 input CSV, $2/$3 1-based columns, $4 "match"/"nomatch",
#            $5 number of worker processes.
filter_csv() {
    local input_file="$1"
    local col_a="$2"           # 1-based column number
    local col_b="$3"           # 1-based column number
    local mode="$4"            # "match" or "nomatch"
    local num_cores="$5"

    [[ -f "$input_file" ]] || die "File not found: $input_file"

    # Read the header
    local header
    header=$(head -n 1 "$input_file")
    IFS=',' read -r -a header_arr <<< "$header"

    local num_cols=${#header_arr[@]}

    # Validate column numbers
    if (( col_a < 1 || col_a > num_cols )); then
        die "Column $col_a is out of range (1–$num_cols)."
    fi
    if (( col_b < 1 || col_b > num_cols )); then
        die "Column $col_b is out of range (1–$num_cols)."
    fi

    # Column names with any trailing carriage return stripped (CRLF input).
    local col_a_name="${header_arr[$((col_a - 1))]}"
    col_a_name="${col_a_name%$'\r'}"
    local col_b_name="${header_arr[$((col_b - 1))]}"
    col_b_name="${col_b_name%$'\r'}"

    # Build output filename
    local base="${input_file%.csv}"
    local out_file
    if [[ "$mode" == "match" ]]; then
        out_file="${base}_match_${col_a_name}_eq_${col_b_name}.csv"
    else
        out_file="${base}_nomatch_${col_a_name}_ne_${col_b_name}.csv"
    fi
    # Sanitise filename
    out_file=$(echo "$out_file" | sed 's/[^A-Za-z0-9._\/-]/_/g')

    # Data-row count (file lines minus the header).
    local total_lines
    total_lines=$(( $(wc -l < "$input_file") - 1 ))

    local label
    [[ "$mode" == "match" ]] && label="==" || label="!="

    echo ""
    echo "Filtering '$input_file' ($total_lines rows)"
    echo "  Condition : column $col_a ($col_a_name) $label column $col_b ($col_b_name)"
    echo "  Output    : $out_file"
    echo "  CPU cores : $num_cores"
    echo ""

    # Header-only file: emit just the header and stop.
    if (( total_lines == 0 )); then
        echo "$header" > "$out_file"
        echo "  No data rows to process."
        echo "Done."
        return
    fi

    if (( num_cores <= 1 )); then
        # ── Single-core with progress bar ──────────────────────────────
        awk -F',' -v col_a="$col_a" -v col_b="$col_b" -v mode="$mode" \
            -v total="$total_lines" \
        'BEGIN { OFS=","; kept=0; skipped=0 }
        NR == 1 {
            print $0
            next
        }
        {
            a = $col_a; gsub(/\r/, "", a)
            b = $col_b; gsub(/\r/, "", b)

            if (mode == "match" && a == b) {
                print $0; kept++
            } else if (mode == "nomatch" && a != b) {
                print $0; kept++
            } else {
                skipped++
            }

            row = NR - 1
            if (row % 1000 == 0 || row == total) {
                pct = int(row * 100 / total)
                filled = int(row * 40 / total)
                bar = ""
                for (j = 0; j < filled; j++) bar = bar "\xe2\x96\x88"
                for (j = filled; j < 40; j++) bar = bar "\xe2\x96\x91"
                printf "\r  [%s] %3d%% (%d/%d rows)", bar, pct, row, total > "/dev/stderr"
                fflush("/dev/stderr")
            }
        }
        END {
            printf "\n" > "/dev/stderr"
            printf "  Kept %d rows, skipped %d rows.\n", kept, skipped | "cat >&2"
        }' "$input_file" > "$out_file"
    else
        # ── Multi-core with progress ───────────────────────────────────
        # Temp dir lives beside the input rather than /tmp so a large input
        # cannot exhaust a small tmpfs partition.
        local tmp_dir
        local base_dir
        base_dir="$(dirname "$input_file")"
        tmp_dir="${base_dir}/.tmp_filter_$$"
        mkdir -p "$tmp_dir"
        _PARALLEL_TMP="$tmp_dir"

        # Split body into chunks
        local chunk_size=$(( (total_lines + num_cores - 1) / num_cores ))
        tail -n +2 "$input_file" | split -l "$chunk_size" -a 4 - "$tmp_dir/chunk_"

        local chunks=( "$tmp_dir"/chunk_* )
        local num_chunks=${#chunks[@]}

        # Initialize progress files
        local i
        for ((i = 0; i < num_chunks; i++)); do
            echo 0 > "$tmp_dir/progress_$i"
        done

        # Start progress monitor
        progress_monitor "$tmp_dir" "$total_lines" "$num_chunks" &
        _MONITOR_PID=$!

        # Process chunks in parallel.  Each worker filters its chunk into
        # result_<i> and reports row counts via progress_<i>.
        local pids=()
        for i in "${!chunks[@]}"; do
            (
                awk -F',' -v col_a="$col_a" -v col_b="$col_b" -v mode="$mode" \
                    -v pfile="$tmp_dir/progress_$i" \
                'BEGIN { OFS="," }
                {
                    a = $col_a; gsub(/\r/, "", a)
                    b = $col_b; gsub(/\r/, "", b)

                    if (mode == "match" && a == b) print $0
                    else if (mode == "nomatch" && a != b) print $0

                    if (NR % 1000 == 0) {
                        print NR > pfile
                        close(pfile)
                    }
                }
                END {
                    print NR > pfile
                    close(pfile)
                }' "${chunks[$i]}" > "$tmp_dir/result_$i"
            ) &
            pids+=($!)
        done

        # Wait for all workers
        for pid in "${pids[@]}"; do
            wait "$pid"
        done

        # Stop monitor and show 100%
        kill "$_MONITOR_PID" 2>/dev/null || true
        wait "$_MONITOR_PID" 2>/dev/null || true
        _MONITOR_PID=""
        draw_progress "$total_lines" "$total_lines"
        echo "" >&2

        # Merge results in chunk order to preserve the original row order.
        echo "$header" > "$out_file"
        for i in "${!chunks[@]}"; do
            cat "$tmp_dir/result_$i" >> "$out_file"
        done

        # Derive kept/skipped from the merged file's line count.
        local kept=$(( $(wc -l < "$out_file") - 1 ))
        local skipped=$(( total_lines - kept ))
        echo "  Kept $kept rows, skipped $skipped rows." >&2

        rm -rf "$tmp_dir"
        _PARALLEL_TMP=""
    fi

    echo ""
    echo "Output: $out_file ($(( $(wc -l < "$out_file") - 1 )) data rows)"
    echo "Done."
}

# ── main ─────────────────────────────────────────────────────────────────────

USE_SUBDIRS="no"    # -d: one subfolder per output file (split mode)
FILTER_MODE=""      # "" = split mode; "match" = -m; "nomatch" = -n
NUM_CORES=1         # -j N: parallel worker count (default single-core)

# Parse flags.  The leading ':' in the optstring makes getopts report a
# missing option argument as ':' (with OPTARG set to the option letter), so
# that case gets its own accurate message instead of "Unknown option: -j".
while getopts ":dmnj:" opt; do
    case $opt in
        d) USE_SUBDIRS="yes" ;;
        m) FILTER_MODE="match" ;;
        n) FILTER_MODE="nomatch" ;;
        j) NUM_CORES="$OPTARG" ;;
        :) die "Option -$OPTARG requires an argument." ;;
        *) die "Unknown option: -$OPTARG" ;;
    esac
done
shift $((OPTIND - 1))

# Validate -j value (regex admits 0, so check the lower bound separately)
[[ "$NUM_CORES" =~ ^[0-9]+$ ]] || die "-j requires a positive integer."
(( NUM_CORES >= 1 )) || die "-j requires a positive integer (got $NUM_CORES)."

# Dispatch on parsed flags and remaining positional arguments:
#   * FILTER_MODE set  → -m/-n filtering, via CLI args or interactive prompts
#   * two positionals  → split mode from the command line
#   * no arguments     → fully interactive menu (file, mode, cores, columns)
#   * anything else    → usage help and exit 1
if [[ -n "$FILTER_MODE" ]]; then
    # ── Filter mode (command-line) ───────────────────────────────────────
    if [[ $# -ge 3 ]]; then
        COL_A="$1"
        COL_B="$2"
        INPUT_FILE="$3"

        [[ "$COL_A" =~ ^[0-9]+$ ]] || die "Column A must be a positive integer."
        [[ "$COL_B" =~ ^[0-9]+$ ]] || die "Column B must be a positive integer."

        filter_csv "$INPUT_FILE" "$COL_A" "$COL_B" "$FILTER_MODE" "$NUM_CORES"

    elif [[ $# -eq 0 ]]; then
        # ── Filter mode (interactive) ────────────────────────────────────
        echo "╔═══════════════════════════════════════╗"
        if [[ "$FILTER_MODE" == "match" ]]; then
            echo "║   CSV Filter — Match (col == col)     ║"
        else
            echo "║   CSV Filter — No-Match (col != col)  ║"
        fi
        echo "╚═══════════════════════════════════════╝"
        echo ""

        # Glob *.csv; with no match the literal pattern string remains,
        # which the next test detects.
        csv_files=( *.csv )
        if [[ ${#csv_files[@]} -eq 0 || "${csv_files[0]}" == "*.csv" ]]; then
            die "No CSV files found in the current directory."
        fi

        echo "CSV files in current directory:"
        for i in "${!csv_files[@]}"; do
            printf "  %3d : %s\n" "$((i + 1))" "${csv_files[$i]}"
        done
        echo ""

        read -rp "Select a file (number or filename): " file_choice

        # Accept either a 1-based menu number or a literal filename.
        if [[ "$file_choice" =~ ^[0-9]+$ ]]; then
            idx=$((file_choice - 1))
            if (( idx < 0 || idx >= ${#csv_files[@]} )); then
                die "Invalid selection."
            fi
            INPUT_FILE="${csv_files[$idx]}"
        else
            INPUT_FILE="$file_choice"
        fi

        [[ -f "$INPUT_FILE" ]] || die "File not found: $INPUT_FILE"

        show_columns "$INPUT_FILE"

        read -rp "Enter the first column number to compare: " COL_A
        [[ "$COL_A" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

        read -rp "Enter the second column number to compare: " COL_B
        [[ "$COL_B" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

        # Prompt for CPU cores (empty input keeps the -j/default value)
        DETECTED_CORES=$(detect_cores)
        echo ""
        read -rp "CPU cores to use (1-$DETECTED_CORES) [1]: " core_choice
        if [[ -n "$core_choice" ]]; then
            [[ "$core_choice" =~ ^[0-9]+$ ]] || die "Core count must be a positive integer."
            (( core_choice >= 1 && core_choice <= DETECTED_CORES )) || die "Core count must be between 1 and $DETECTED_CORES."
            NUM_CORES="$core_choice"
        fi

        filter_csv "$INPUT_FILE" "$COL_A" "$COL_B" "$FILTER_MODE" "$NUM_CORES"
    else
        die "Usage: $0 -m|-n [-j N] <col_a> <col_b> <input.csv>"
    fi

elif [[ $# -ge 2 ]]; then
    # ── Split mode (command-line) ────────────────────────────────────────
    INPUT_FILE="$1"
    SPLIT_COL="$2"

    [[ "$SPLIT_COL" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

    split_csv "$INPUT_FILE" "$SPLIT_COL" "$USE_SUBDIRS" "$NUM_CORES"

elif [[ $# -eq 0 ]]; then
    # ── Interactive mode ─────────────────────────────────────────────────
    echo "╔═══════════════════════════════════════╗"
    echo "║      CSV Splitter — Interactive       ║"
    echo "╚═══════════════════════════════════════╝"
    echo ""

    # List CSV files in the current directory
    csv_files=( *.csv )
    if [[ ${#csv_files[@]} -eq 0 || "${csv_files[0]}" == "*.csv" ]]; then
        die "No CSV files found in the current directory."
    fi

    echo "CSV files in current directory:"
    for i in "${!csv_files[@]}"; do
        printf "  %3d : %s\n" "$((i + 1))" "${csv_files[$i]}"
    done
    echo ""

    read -rp "Select a file (number or filename): " file_choice

    # Determine if they entered a number or a filename
    if [[ "$file_choice" =~ ^[0-9]+$ ]]; then
        idx=$((file_choice - 1))
        if (( idx < 0 || idx >= ${#csv_files[@]} )); then
            die "Invalid selection."
        fi
        INPUT_FILE="${csv_files[$idx]}"
    else
        INPUT_FILE="$file_choice"
    fi

    [[ -f "$INPUT_FILE" ]] || die "File not found: $INPUT_FILE"

    show_columns "$INPUT_FILE"

    # Ask what mode to use
    echo "Modes:"
    echo "  1 : Split — split file by unique values in a column"
    echo "  2 : Match — keep rows where two columns are equal"
    echo "  3 : No-match — keep rows where two columns differ"
    echo ""
    read -rp "Select mode (1/2/3): " mode_choice

    # Prompt for CPU cores (empty input keeps the -j/default value)
    DETECTED_CORES=$(detect_cores)
    echo ""
    read -rp "CPU cores to use (1-$DETECTED_CORES) [1]: " core_choice
    if [[ -n "$core_choice" ]]; then
        [[ "$core_choice" =~ ^[0-9]+$ ]] || die "Core count must be a positive integer."
        (( core_choice >= 1 && core_choice <= DETECTED_CORES )) || die "Core count must be between 1 and $DETECTED_CORES."
        NUM_CORES="$core_choice"
    fi

    case "$mode_choice" in
        1)
            read -rp "Enter the column number to split on: " SPLIT_COL
            [[ "$SPLIT_COL" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

            # Only offer the subfolder prompt if -d wasn't already given
            if [[ "$USE_SUBDIRS" != "yes" ]]; then
                read -rp "Place each CSV in its own subfolder? (y/N): " subdir_choice
                [[ "$subdir_choice" =~ ^[Yy] ]] && USE_SUBDIRS="yes"
            fi

            split_csv "$INPUT_FILE" "$SPLIT_COL" "$USE_SUBDIRS" "$NUM_CORES"
            ;;
        2)
            read -rp "Enter the first column number to compare: " COL_A
            [[ "$COL_A" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."
            read -rp "Enter the second column number to compare: " COL_B
            [[ "$COL_B" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

            filter_csv "$INPUT_FILE" "$COL_A" "$COL_B" "match" "$NUM_CORES"
            ;;
        3)
            read -rp "Enter the first column number to compare: " COL_A
            [[ "$COL_A" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."
            read -rp "Enter the second column number to compare: " COL_B
            [[ "$COL_B" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

            filter_csv "$INPUT_FILE" "$COL_A" "$COL_B" "nomatch" "$NUM_CORES"
            ;;
        *)
            die "Invalid mode selection."
            ;;
    esac

else
    echo "Usage:"
    echo "  Split        : $0 [-d] [-j N] <input.csv> <column_number>"
    echo "  Match filter : $0 [-j N] -m <col_a> <col_b> <input.csv>"
    echo "  No-match     : $0 [-j N] -n <col_a> <col_b> <input.csv>"
    echo "  Interactive  : $0"
    echo ""
    echo "  -d     Place each output CSV in its own subfolder"
    echo "  -m     Keep only rows where col_a == col_b"
    echo "  -n     Keep only rows where col_a != col_b"
    echo "  -j N   Use N CPU cores for parallel processing (default: 1)"
    exit 1
fi
Download SH

← Back to SH