macFilenameSort

#!/usr/bin/env perl -w
#
# macFilenameSort: Sort filenames considering numeric parts.
# 2007-09-07: Written by Steven J. DeRose.
#
use strict;
use Getopt::Long;

use sjdUtils;
#use alogging;

# Descriptive metadata about this script: title, authorship, dates, license.
our %metadata = (
    title        => 'macFilenameSort',
    description  => 'Sort filenames considering numeric parts.',
    rightsHolder => 'Steven J. DeRose',
    creator      => 'http://viaf.org/viaf/50334488',
    type         => 'http://purl.org/dc/dcmitype/Software',
    language     => 'Perl 5.18',
    created      => '2007-09-07',
    modified     => '2021-11-08',
    publisher    => 'http://github.com/sderose',
    license      => 'https://creativecommons.org/licenses/by-sa/3.0/',
);

# The version date is simply the last-modified date recorded above.
our $VERSION_DATE = $metadata{modified};


=pod

=head1 Usage

macFilenameSort

Sort a series of lines from stdin, like the way Mac OS X sorts filenames.
This amounts to separating numeric and non-numeric portions, and treating
them as a series of separate fields, sorted appropriately for each one's type
(numeric or alphabetic). 

Only decimal integers are recognized, and they are considered to precede other characters.

For example,
    file_9
    file_10
    dir_13
    dir_10000
    
sorts to:
    dir_13
    dir_10000
    file_9
    file_10

whereas regular *nix C<ls -1> produces:
    dir_10000
    dir_13
    file_10
    file_9

To apply this to C<ls>, do:
    ls | macFilenameSort

or
    macFilenameSort *
    
Using C<ls>, C<find>, etc. is more flexible than just globbing, for example because
you can choose whether to include hidden ("."-initial) items, etc.

Use I<--foldersFirst> to keep directories at the top.

You can pipe onward to my C<wrap> to format back into multiple columns if desired.


=head1 Options

=over

=item * B<--ignorecase> OR B<-i>

Fold non-numeric portions to lower case for comparison. This is on by default;
use I<--noignoreCase> to compare case-sensitively.

=item * B<-F>

Like C<ls -F>, append type-flags to names: directories ("/"),
executables ("*"), links ("@"), sockets ("="), or FIFOs ("|").
Whiteouts ("%") are not currently detected.

=item * B<--foldersFirst>

Sort directories/folders to the top.

=item * B<--maxFields> I<n>

Change the limit on the number of fields into which a line (or filename) can
be divided. Default: 255.

=item * B<--quiet> OR B<-q>

Suppress most messages.

=item * B<--verbose> OR B<-v>

Add more detailed messages.

=item * B<--version>

Display version info and exit.

=back


=head1 Known bugs and limitations

Doesn't handle non-integers. 3.14 will be treated as 3, ".", and 14.

Dies if a line has more than I<--maxFields> (numeric and string) portions.

Is this stable or not?

=head1 To do

Perhaps allow Finder-like sort choices: name, kind, dates, size, tags?


=head1 Related commands

*nix C<find>, C<ls>, C<sort>, C<msort>,....

My C<PowerWalk.py>.


=head1 History

  2007-09-07: Written by Steven J. DeRose.
  2012-09-12: Clean up.
  2015-03-27: Fix pod. Use sjdUtils. Add -i. Separate comparison.
    Change from padding numerics to fixed-width, to true multi-key sort.
  2021-08-31: New layout.
  2021-11-08: Add --foldersFirst, -F. Allow file-list as args as well as STDIN.


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L<http://creativecommons.org/licenses/by-sa/3.0/>.

For the most recent version, see L<http://www.derose.net/steve/utilities/>.


=cut


###############################################################################
#
# Option defaults.
my $F            = 0;    # -F: append ls-style type flags to each name
my $foldersFirst = 0;    # --foldersFirst: sort directories to the top
my $ignoreCase   = 1;    # -i: fold case when comparing text fields (on by default)
my $maxFields    = 255;  # --maxFields: limit on key fields per record
my $quiet        = 0;    # -q: accepted, but not consulted by the code in this file
my $verbose      = 0;    # -v: may be repeated; any non-zero value enables tracing

# Process options. Option names are matched case-insensitively.
Getopt::Long::Configure("ignore_case");
my $result = GetOptions(
    "F!"                => \$F,
    "foldersFirst!"     => \$foldersFirst,
    # List-form system() bypasses the shell, so a script path containing
    # spaces or shell metacharacters is passed to perldoc intact.
    "h|help|?"          => sub { system("perldoc", $0); exit; },
    "i|ignoreCase!"     => \$ignoreCase,
    "maxFields=i"       => \$maxFields,
    "q|quiet!"          => \$quiet,
    "v|verbose+"        => \$verbose,
    "version"           => sub {
        die "Version of $VERSION_DATE, by Steven J. DeRose.\n";
    },
);

($result) || die "Bad options.\n";

# A non-positive limit would make every comparison fail later with a
# misleading "too many fields" message, so reject it up front.
($maxFields > 0) || die "--maxFields must be a positive integer.\n";


###############################################################################
# Break a string into alternating non-digit and digit runs.
#
# The split pattern captures each run of digits, so the digit runs are kept
# as fields of their own between the surrounding text. A string that begins
# with digits yields an empty first field; an empty string yields no fields.
#
# Returns a two-element list: a reference to the array of fields, and the
# number of fields in it.
#
sub splitFields {
    my $line = shift;
    my $parts = [ split(/(\d+)/, $line) ];
    return ($parts, scalar @{$parts});
}

# Sort comparator: compares the two records in sort's package globals $a and $b.
#
# Each record is a reference to [ record, keyFieldsRef ], where keyFieldsRef
# refers to the list of (typically alternating) alphabetic and numeric parts
# of the record, plus an optional leading entry saying whether the item is a
# directory.
#
# Key fields are compared pairwise, left to right, using compareByType(); the
# first pair that differs decides the order. If one key list is a prefix of
# the other, the shorter one sorts first. Records whose keys are all equal
# compare as equal (returns 0).
#
# Dies if either argument is not an array reference, or if either record has
# more than $maxFields key fields.
#
sub compareArrays {
    if (ref($a) ne "ARRAY" || ref($b) ne "ARRAY") {
        die "compareArrays didn't get 2 refs to arrays: " . ref($a) .
            ", " . ref($b) . ".\n";
    }

    # Work through the references; copying both key lists on every
    # comparison is unnecessary.
    my $keysa  = $a->[1];
    my $keysb  = $b->[1];
    my $nkeysa = scalar @{$keysa};
    my $nkeysb = scalar @{$keysb};
    my $maxFound = ($nkeysa > $nkeysb) ? $nkeysa : $nkeysb;

    # Enforce the field limit before comparing, so the outcome does not
    # depend on where two keys happen to differ.
    if ($maxFound > $maxFields) {
        die sprintf("compareArrays: Too many fields in key: a %d (%s), b %d (%s), max %d.\n",
            $nkeysa, join(", ", @{$keysa}), $nkeysb, join(", ", @{$keysb}), $maxFields);
    }

    # Only positions present in both keys are handed to compareByType(),
    # so it never receives an undefined field.
    my $nShared = ($nkeysa < $nkeysb) ? $nkeysa : $nkeysb;
    for my $i (0 .. $nShared - 1) {
        my $rc = compareByType($keysa->[$i], $keysb->[$i]);
        return $rc if ($rc != 0);
    }

    # All shared fields are equal: the shorter key sorts first, and keys of
    # equal length are identical.
    return ($nkeysa <=> $nkeysb);
}

# Compare two key fields according to what kind of value each one is.
#
# Each field is classified with sjdUtils::isInteger():
#   * two integers are compared numerically;
#   * an integer sorts before a non-integer;
#   * two non-integers are compared as strings, lower-cased first when
#     $ignoreCase is set.
#
# Returns a negative, zero, or positive value, like <=> and cmp.
#
sub compareByType {
    my ($left, $right) = @_;
    my $leftIsInt  = sjdUtils::isInteger($left);
    my $rightIsInt = sjdUtils::isInteger($right);

    if ($leftIsInt && $rightIsInt) {
        return ($left <=> $right);
    }
    if ($leftIsInt) {
        return -1;
    }
    if ($rightIsInt) {
        return 1;
    }
    return $ignoreCase ? (lc($left) cmp lc($right)) : ($left cmp $right);
}

# Return the flag character that "ls -F" would append to a file (if any).
#
# Arguments:
#   $path  The file to examine.
#   $F     True if flagging was requested; if false, "" is always returned.
#
# Returns "@" for a symbolic link, "/" for a directory, "=" for a socket,
# "|" for a FIFO, "*" for an executable regular file, and "" otherwise
# (including when $path does not exist).
#
# The symlink test must come first: -l examines the link itself, while the
# other file tests follow links, so a link to a directory or to an executable
# would otherwise be reported as its target's type. The executable test comes
# last and is limited to regular files, because sockets and FIFOs can carry
# execute permission bits too.
#
# Whiteouts ("%" in ls -F) are not detected; Perl has no file test for them.
#
sub getFlag {
    my ($path, $F) = @_;
    return ""  if (!$F);                     # Option was not even set
    return "@" if (-l $path);                # links
    return "/" if (-d $path);                # directories
    return "=" if (-S $path);                # sockets
    return "|" if (-p $path);                # FIFOs
    return "*" if (-f $path && -x $path);    # executables
    return "";
}

    
###############################################################################
# Main
#
# Read all the input or args.
#
# Gather the names to sort: the command-line arguments if there are any,
# otherwise one name per line of input (line endings removed).
my @items = ();
if (@ARGV) {
    @items = @ARGV;
}
else {
    chomp(@items = <>);
}

# Build one [ record, keyFieldsRef ] entry per name, where the key fields
# are the name's digit and non-digit runs.
#
my @records = ();  # Array of [ record, [keys] ] pairs
my %nRecsByNFields = ();  # Tally of records by number of key fields
for my $item (@items) {
    my ($keys, $nKeys) = splitFields($item);

    # With --foldersFirst, put a leading 0 (directory) or 1 (anything else)
    # in front of the other key fields.
    if ($foldersFirst) {
        unshift @{$keys}, ((-d $item) ? 0 : 1);
        $nKeys += 1;
    }

    if ($verbose) {
        warn sprintf("|%d] [ %s ]: '%s'\n", $nKeys, join(", ", @{$keys}), $item);
    }

    $nRecsByNFields{$nKeys} += 1;
    push @records, [ $item, $keys ];
}

my @srecords = sort compareArrays @records;

# Emit the names in sorted order, each followed by its type flag (if any).
foreach my $entry (@srecords) {
    my $name = $entry->[0];
    my $line = $name . getFlag($name, $F) . "\n";
    print($line);
}