Script to drop large commands from database for performance

I recently ran into the issue that Atuin searches had become quite slow.
Using this statistics script I found that I had a few entries that were quite large:

import sqlite3
from collections import Counter
import os

def create_histogram(data, bins=30, width=60):
    """Create an ASCII histogram

    Args:
        data: List of values
        bins: Number of bins
        width: Width of the histogram in characters
    """
    # Calculate min and max for bin ranges
    min_val, max_val = min(data), max(data)
    bin_size = (max_val - min_val) / bins
    if bin_size == 0:  # all values are identical; avoid division by zero below
        bin_size = 1

    # Create bins
    counts = Counter()
    for value in data:
        bin_index = int((value - min_val) / bin_size)
        if bin_index == bins:  # Handle edge case for max value
            bin_index -= 1
        counts[bin_index] += 1

    # Find maximum count for scaling
    max_count = max(counts.values()) if counts else 0

    # Print histogram
    for i in range(bins):
        bin_start = min_val + i * bin_size
        bin_end = bin_start + bin_size
        count = counts[i]

        # Calculate bar length
        bar_length = int((count / max_count) * width) if max_count > 0 else 0

        # Print bar
        print(f"{bin_start:6.1f} - {bin_end:6.1f} | {'#' * bar_length} ({count})")

def format_command(command, max_length=80):
    """Format command for display with length limit and ellipsis"""
    if len(command) > max_length:
        return command[:max_length-3] + "..."
    return command

def main():
    # Use Atuin's default database path
    db_path = os.path.expanduser("~/.local/share/atuin/history.db")

    # Connect to database
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Query command lengths
        cursor.execute("SELECT LENGTH(command), command FROM history WHERE deleted_at IS NULL")
        results = cursor.fetchall()

        if not results:
            print("No commands found in database")
            return

        command_lengths = [row[0] for row in results]

        # Print statistics
        print("\nAtuin Command History Analysis")
        print("-" * 70)
        print(f"Total commands: {len(command_lengths):,}")
        print(f"Min length: {min(command_lengths)} characters")
        print(f"Max length: {max(command_lengths)} characters")
        print(f"Average length: {sum(command_lengths)/len(command_lengths):.1f} characters")
        print("\nLength Distribution Histogram:")
        print("-" * 70)

        # Create and display histogram
        create_histogram(command_lengths)

        # Print longest commands
        print("\nTop 10 Longest Commands:")
        print("-" * 70)

        # Query and sort longest commands
        cursor.execute("""
            SELECT command, LENGTH(command) as len
            FROM history
            WHERE deleted_at IS NULL
            ORDER BY len DESC
            LIMIT 10
        """)

        longest_commands = cursor.fetchall()
        for i, (command, length) in enumerate(longest_commands, 1):
            formatted_command = format_command(command)
            print(f"{i:2d}. [{length:4d}] {formatted_command}")

    except sqlite3.Error as e:
        print(f"Database error: {e}")
        print(f"Make sure Atuin is installed and the database exists at: {db_path}")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        conn.close()

if __name__ == "__main__":
    main()
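
Before deleting anything, it can help to check how much of the database the outliers actually account for. Here is a minimal sketch against the same database; the 1000-character cutoff is just an assumption, pick whatever the histogram above suggests:

import sqlite3
import os

THRESHOLD = 1000  # assumed cutoff; adjust based on the histogram output

db_path = os.path.expanduser("~/.local/share/atuin/history.db")
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    # Rows above the cutoff and the bytes of command text they occupy
    cursor.execute(
        "SELECT COUNT(*), COALESCE(SUM(LENGTH(command)), 0) FROM history "
        "WHERE deleted_at IS NULL AND LENGTH(command) > ?",
        (THRESHOLD,),
    )
    over_count, over_bytes = cursor.fetchone()
    # Same figures for the whole table, for comparison
    cursor.execute(
        "SELECT COUNT(*), COALESCE(SUM(LENGTH(command)), 0) FROM history "
        "WHERE deleted_at IS NULL"
    )
    total_count, total_bytes = cursor.fetchone()
    print(f"{over_count} of {total_count} commands exceed {THRESHOLD} characters")
    print(f"They account for {over_bytes / max(total_bytes, 1) * 100:.1f}% of the command text")
finally:
    conn.close()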

Afterwards I wrote a script that removes duplicate commands from the history as well as large commands:

import sqlite3
import os


def get_file_size(path):
    return os.path.getsize(path) / (1024 * 1024)  # Convert to MB


def deduplicate_history() -> None:
    path = os.path.expanduser("~/.local/share/atuin/history.db")

    # Get initial size
    initial_size = get_file_size(path)
    print(f"Initial database size: {initial_size:.2f} MB")

    # Connect to the database
    conn = sqlite3.connect(path)
    cursor = conn.cursor()

    try:
        # Get initial record count
        cursor.execute("SELECT COUNT(*) FROM history")
        initial_records = cursor.fetchone()[0]

        # Begin transaction
        cursor.execute("BEGIN TRANSACTION")

        # Delete duplicates keeping the most recent entry
        cursor.execute("""
            DELETE FROM history
            WHERE id IN (
                SELECT h1.id
                FROM history h1
                JOIN (
                    SELECT command, cwd, MAX(timestamp) as max_timestamp
                    FROM history
                    GROUP BY command, cwd
                    HAVING COUNT(*) > 1
                ) h2
                ON h1.command = h2.command
                AND h1.cwd = h2.cwd
                WHERE h1.timestamp < h2.max_timestamp
            )
        """)
        duplicates_removed = cursor.rowcount

        # Delete commands longer than 1000 bytes
        cursor.execute("""
            DELETE FROM history
            WHERE LENGTH(command) > 1000
        """)
        long_commands_removed = cursor.rowcount

        # Commit changes
        conn.commit()

        # Get size before vacuum
        pre_vacuum_size = get_file_size(path)

        # Vacuum the database
        conn.execute("VACUUM")

        # Get final statistics
        cursor.execute("SELECT COUNT(*) FROM history")
        final_records = cursor.fetchone()[0]
        final_size = get_file_size(path)

        print("\nOperation Summary:")
        print(f"- Duplicate commands removed: {duplicates_removed}")
        print(f"- Long commands removed: {long_commands_removed}")
        print(f"- Records: {initial_records} → {final_records}")
        print("\nSize Changes:")
        print(f"- Initial size: {initial_size:.2f} MB")
        print(f"- Pre-vacuum size: {pre_vacuum_size:.2f} MB")
        print(f"- Final size: {final_size:.2f} MB")
        print(
            f"- Total space saved: {(initial_size - final_size):.2f} MB ({((initial_size - final_size) / initial_size * 100):.1f}%)"
        )

    except sqlite3.Error as e:
        print(f"An error occurred: {e}")
        conn.rollback()
    finally:
        conn.close()


if __name__ == "__main__":
    deduplicate_history()

This brought the size down from 50 MB to 25 MB, and my Atuin search is noticeably snappier. You may not need to drop duplicates; in my case most of the gain came from dropping the large commands themselves.
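
If you only want that part, a stripped-down sketch of the large-command cleanup on its own could look like this (same assumptions as above: the default database path and a 1000-character cutoff), followed by a VACUUM to reclaim the space:

import sqlite3
import os

db_path = os.path.expanduser("~/.local/share/atuin/history.db")
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    # Hard-delete oversized commands, same as in the full script above
    cursor.execute("DELETE FROM history WHERE LENGTH(command) > 1000")
    print(f"Removed {cursor.rowcount} long commands")
    conn.commit()
    # VACUUM must run outside a transaction; it rewrites the file at its new size
    conn.execute("VACUUM")
finally:
    conn.close()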

Next question: how can I discard the remote database and use my local state instead?