csv_split.sh

Shell Script Uploaded Feb 16, 2026 by pagetelegram 1 views 7.42 KB
This lists CSV files in the directory, lets you pick one, shows all columns with numbers, and prompts for the split column. What it does: - Reads the CSV header to get column names - Splits all rows by unique values in the specified column (column x) - Column 2 is treated as the values column - Each output file gets the full header + matching rows - Output files are named <ColumnName>_<value>.csv (e.g., HCPCS_CODE_T1019.csv) - Files are saved to a <inputname>_split/ directory - Reports row counts per output file when done Command-line mode: ./csv_split.sh <input.csv> <column_number> # Example: ./csv_split.sh medicaid.csv 3
research BASH csv split
SH csv_split.sh
#!/usr/bin/env bash
#
# csv_split.sh — Split a CSV file into separate files based on unique values
#                in a specified column. Each output file contains rows sharing
#                the same value in that column, with column 2 as the values column.
#
# Output filenames:  <column_header>_<value>.csv
#
# Usage (command-line):
#   ./csv_split.sh <input.csv> <column_number>
#   ./csv_split.sh -d <input.csv> <column_number>   # each file in its own subfolder
#
# Usage (interactive):
#   ./csv_split.sh
#
# Flags:
#   -d    Place each output CSV in its own subfolder named after the file prefix
#

set -euo pipefail

# ── helpers ──────────────────────────────────────────────────────────────────

die() { echo "ERROR: $*" >&2; exit 1; }

show_columns() {
    local file="$1"
    echo ""
    echo "Available columns in '$file':"
    echo "─────────────────────────────────────────"
    # Read header and print each column with its number
    IFS=',' read -r -a headers < "$file"
    for i in "${!headers[@]}"; do
        printf "  %3d : %s\n" "$((i + 1))" "${headers[$i]}"
    done
    echo "─────────────────────────────────────────"
    echo ""
}

split_csv() {
    local input_file="$1"
    local split_col="$2"        # 1-based column number to split on
    local use_subdirs="$3"       # "yes" to place each file in its own subfolder
    local values_col=2           # column 2 holds the values

    [[ -f "$input_file" ]] || die "File not found: $input_file"

    # Read the header
    local header
    header=$(head -n 1 "$input_file")
    IFS=',' read -r -a header_arr <<< "$header"

    local num_cols=${#header_arr[@]}

    # Validate column number
    if (( split_col < 1 || split_col > num_cols )); then
        die "Column $split_col is out of range (1–$num_cols)."
    fi

    local col_name="${header_arr[$((split_col - 1))]}"
    # Strip any trailing carriage return from the column name
    col_name="${col_name%$'\r'}"

    local val_col_name="${header_arr[$((values_col - 1))]}"
    val_col_name="${val_col_name%$'\r'}"

    # Create output directory
    local out_dir="${input_file%.csv}_split"
    mkdir -p "$out_dir"

    echo ""
    echo "Splitting '$input_file'"
    echo "  Split column : $split_col ($col_name)"
    echo "  Values column: $values_col ($val_col_name)"
    echo "  Output dir   : $out_dir/"
    [[ "$use_subdirs" == "yes" ]] && echo "  Subfolders   : enabled (-d)"
    echo ""

    # Use awk to split the file
    # awk is 1-based for fields, which matches our convention
    awk -F',' -v col="$split_col" \
              -v vcol="$values_col" \
              -v col_name="$col_name" \
              -v val_col_name="$val_col_name" \
              -v out_dir="$out_dir" \
              -v use_subdirs="$use_subdirs" \
    'BEGIN { OFS="," }
    NR == 1 {
        # Store the full header for output files
        full_header = $0
        # Build a two-column header: split_col_name, values_col_name
        mini_header = col_name "," val_col_name
        next
    }
    {
        # Get the value in the split column
        val = $col
        # Strip carriage returns
        gsub(/\r/, "", val)
        # Sanitise the value for use as a filename
        safe_val = val
        gsub(/[^A-Za-z0-9._-]/, "_", safe_val)

        # Build the filename prefix
        prefix = col_name "_" safe_val

        if (use_subdirs == "yes") {
            # Place each CSV in its own subfolder named after the prefix
            subdir = out_dir "/" prefix
            outfile = subdir "/" prefix ".csv"
        } else {
            outfile = out_dir "/" prefix ".csv"
        }

        # Write header if this is a new file
        if (!(val in seen)) {
            seen[val] = 1
            if (use_subdirs == "yes") {
                system("mkdir -p \"" subdir "\"")
            }
            print full_header > outfile
            count++
        }

        # Write the data row
        print $0 >> outfile
    }
    END {
        printf "  Created %d files for %d unique values in column \"%s\".\n", count, count, col_name
    }' "$input_file"

    echo ""
    echo "Output files:"
    find "$out_dir" -name '*.csv' -type f | sort | while read -r f; do
        rows=$(( $(wc -l < "$f") - 1 ))
        # Show path relative to out_dir
        rel="${f#"$out_dir"/}"
        printf "  %-60s  (%d rows)\n" "$rel" "$rows"
    done
    echo ""
    echo "Done."
}

# ── main ─────────────────────────────────────────────────────────────────────

USE_SUBDIRS="no"

# Parse flags
while getopts ":d" opt; do
    case $opt in
        d) USE_SUBDIRS="yes" ;;
        *) die "Unknown option: -$OPTARG" ;;
    esac
done
shift $((OPTIND - 1))

if [[ $# -ge 2 ]]; then
    # ── Command-line mode ────────────────────────────────────────────────
    INPUT_FILE="$1"
    SPLIT_COL="$2"

    [[ "$SPLIT_COL" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

    split_csv "$INPUT_FILE" "$SPLIT_COL" "$USE_SUBDIRS"

elif [[ $# -eq 0 ]]; then
    # ── Interactive mode ─────────────────────────────────────────────────
    echo "╔═══════════════════════════════════════╗"
    echo "║      CSV Splitter — Interactive       ║"
    echo "╚═══════════════════════════════════════╝"
    echo ""

    # List CSV files in the current directory
    csv_files=( *.csv )
    if [[ ${#csv_files[@]} -eq 0 || "${csv_files[0]}" == "*.csv" ]]; then
        die "No CSV files found in the current directory."
    fi

    echo "CSV files in current directory:"
    for i in "${!csv_files[@]}"; do
        printf "  %3d : %s\n" "$((i + 1))" "${csv_files[$i]}"
    done
    echo ""

    read -rp "Select a file (number or filename): " file_choice

    # Determine if they entered a number or a filename
    if [[ "$file_choice" =~ ^[0-9]+$ ]]; then
        idx=$((file_choice - 1))
        if (( idx < 0 || idx >= ${#csv_files[@]} )); then
            die "Invalid selection."
        fi
        INPUT_FILE="${csv_files[$idx]}"
    else
        INPUT_FILE="$file_choice"
    fi

    [[ -f "$INPUT_FILE" ]] || die "File not found: $INPUT_FILE"

    show_columns "$INPUT_FILE"

    read -rp "Enter the column number to split on: " SPLIT_COL

    [[ "$SPLIT_COL" =~ ^[0-9]+$ ]] || die "Column number must be a positive integer."

    # Ask about subfolder mode if -d wasn't already set on the command line
    if [[ "$USE_SUBDIRS" != "yes" ]]; then
        read -rp "Place each CSV in its own subfolder? (y/N): " subdir_choice
        [[ "$subdir_choice" =~ ^[Yy] ]] && USE_SUBDIRS="yes"
    fi

    split_csv "$INPUT_FILE" "$SPLIT_COL" "$USE_SUBDIRS"

else
    echo "Usage:"
    echo "  Command-line : $0 [-d] <input.csv> <column_number>"
    echo "  Interactive  : $0 [-d]"
    echo ""
    echo "  -d   Place each output CSV in its own subfolder"
    exit 1
fi
Download SH
← Back to SH