#!/usr/bin/env perl
#
# Test suite for vw:
#
# You may add arbitrary (train/test/varying-options) tests
# by adding data files and their expected reference STDOUT and STDERR
#
# See __DATA__ below for how to add more tests
#
require 5.008;
use warnings;

use Getopt::Std;
use File::Basename;

use vars qw($opt_d $opt_D $opt_c $opt_e $opt_f
            $opt_E $opt_o $opt_w $opt_y $opt_t
            $opt_v $opt_V $opt_O);

# Default tolerance for fuzzy (-f) numeric comparisons; -E overrides it
my $Epsilon = 1e-4;

# The vw executable under test, assigned in init()
my $VW;

# External utilities we use. See init() for Windows specific actions.
my $Diff = 'diff';
my $Cat = 'cat';

$ENV{'PATH'} .= ':test:../vowpalwabbit:vowpalwabbit:.';

# -V prefixes each vw command with valgrind, using these options.
# We should adjust the default options over time to what looks most useful.
my $Valgrind = 'valgrind --quiet --error-exitcode=100 --track-origins=yes --leak-check=full';

# -- timeout is part of GNU coreutils, some systems may not have it
my $TimeOut = '';
my $TimeOutSec = 80;    # max allowed time for single vw command run

# By default, we run all tests in the list
my $FullRun = 1;
my $ErrorCount = 0;

# These --side-by-side diff opts are used to make the
# fuzzy-compare easier: just split on '|' and compare numeric values
# word by word:
# NOTE: -W 160 is sufficient for most outputs.
#       --bfgs prints widest (134 chars-per-line)
my $DiffOpts = '-N --minimal --suppress-common-lines --ignore-all-space --strip-trailing-cr --side-by-side -W 160';

# Regexp (as a string) used to break a diff line into words for the
# fuzzy compare. Declared lexically like every other setting in this
# section instead of leaking into the package namespace.
my $WordSplit = "[ \t:]+";

# These diff options are used for the diff we want to show the user
# The intent is to make them easier to parse (and compare values) by a human
my $DisplayDiffOpts = '-u --minimal';

# Directories searched (before $PATH) for a 'vw' executable
my @PathAdd = qw(. .. ../vowpalwabbit);

# Explicit test numbers requested on the command line (empty: run all)
my @ToTest = ();

# __DATA__ test counter
my $TestNo = 0;

#
# v($level, @message)
#   Verbose printing to STDERR: the message is emitted only when the
#   current -v verbosity is at least $level.
#   A single message argument is printed verbatim; several arguments
#   are treated as a printf format followed by its values.
#
sub v($;@) {
    my ($min_level, @message) = @_;
    return if ($opt_v < $min_level);

    if (scalar(@message) != 1) {
        printf STDERR @message;
    } else {
        print STDERR @message;
    }
}

#
# usage(@message)
#   Print the optional message (followed by a newline) to STDERR,
#   then die with the full usage/help text. Never returns.
#
sub usage(@) {
    if (@_) {
        print STDERR @_, "\n";
    }

    my $help_text = "Usage: $0 [options] [testno...] [vw-executable]
    By default will run against the 1st 'vw' executable found in:
        @PathAdd  \$PATH

    Options:
        -c      print test-suite commands before running them
        -d      print diff output on significant diff failure
        -D      print diff output even if it is not significant
        -e      exit with non-zero status on first error
        -w      Ignore white-space differences (diff --ignore-space-change)
        -f      Ignore small (< $Epsilon) floating-point differences (fuzzy compare)
        -E<e>   Tolerance epsilon <e> for fuzzy compares (default $Epsilon)
        -o      Overwrite reference file with new/different result
        -y      On error, copy bad files to (eg stderr.test21) for later comparison
        -v<L>   Verbosity <L> (small integer) is verbosity level
        -V      apply valgrind to vw commands
        -t<T>   Apply timeout <T> (default $TimeOutSec) secs to individual tests
                (will only work where GNU coreutils 'timeout' is present)
        -O<O>   Add <O> option(s) to all vw commands

    [testno...]   Optional integer args: explicit test numbers (skip others)
";
    die $help_text;
}

#
# mysystem($cmd)
#   Run $cmd through system(), echoing it first at verbosity >= 1.
#   Returns whatever system() returns; $? is left set for the caller.
#
sub mysystem {
    my ($shell_cmd) = @_;
    v(1, "%s\n", $shell_cmd);
    return system($shell_cmd);
}

#
# command_failed($cmd)
#   Inspect $? (as left by the most recent system() call) and report,
#   on STDERR, why $cmd did not complete normally:
#     - it could not be started at all ($? == -1)
#     - it died from a signal (optionally dumping core)
#     - it was killed by 'timeout' (exit code 124)
#     - it exited with any other non-zero exit code
#   Returns 0 if the command succeeded, non-zero otherwise:
#   the exit code of the command, or 1 for a signal / failure to start.
#
sub command_failed($) {
    my $cmd = shift;

    # Capture $? immediately, before anything else gets a chance to change it
    my $wait_status = $?;
    return 0 unless ($wait_status);

    if ($wait_status == -1) {
        # system() could not even start the command. Without this
        # check the bit tests below would misreport it as
        # "died from signal 127 (core dumped)".
        printf STDERR
            "$0: test $TestNo: '%s' failed to execute: %s\n", $cmd, $!;
        return 1;
    }

    my $exitcode = $wait_status >> 8;
    my $signal   = $wait_status & 127;
    my $core     = ($wait_status & 128) ? ' (core dumped)' : '';

    if ($signal) {
        printf STDERR
            "$0: test $TestNo: '%s' died from signal $signal$core\n", $cmd;
        $exitcode = 1;
    } elsif ($exitcode == 124) {
        # 124 is the exit code GNU 'timeout' uses when the time limit is hit
        printf STDERR
            "$0: test $TestNo: '%s' timed-out (exitcode=$exitcode)\n" .
            "$0: test $TestNo: you may increase the imposed time-out: \$TimeOutSec=%d\n",
            $cmd, $TimeOutSec;
    } elsif ($exitcode) {
        printf STDERR
            "$0: test $TestNo: '%s' failed (exitcode=$exitcode)\n", $cmd;
    }

    # This is non-zero only if $cmd failed
    return $exitcode;
}

#
# valgrind_errfile($testno)
#   Name of the file receiving the valgrind log for test $testno
#
sub valgrind_errfile($) {
    my ($test_number) = @_;
    return 'Test-' . $test_number . '.valgrind-err';
}

#
# which vw executable to test against
#
#
# which_vw()
#   Returns the path of the vw executable to test:
#     - the first remaining command-line argument, if there is one
#       (it must be an executable plain file, otherwise usage() dies)
#     - else the first executable plain file named 'vw' found in
#       @PathAdd followed by the directories of $PATH
#   Dies via usage() when no executable can be found.
#
sub which_vw() {
    if (@ARGV) {
        my $exe = $ARGV[0];
        unless (-f $exe && -x $exe) {
            usage("$0: argument $exe: not an executable file");
        }
        printf STDERR "Testing vw: %s\n", $exe;
        return $exe;
    }

    my $path = defined($ENV{PATH}) ? $ENV{PATH} : '';
    foreach my $dir (@PathAdd, split(':', $path)) {
        my $exe = "$dir/vw";
        # Require a plain file: directories are normally "executable"
        # too, so a directory named 'vw' must not be picked up here.
        next unless (-f $exe && -x $exe);
        printf STDERR "Testing vw: %s\n", $exe;
        return $exe;
    }
    usage("can't find a 'vw' executable to test on");
}


#
# init()
#   One-time setup:
#     - parse the command-line options into the $opt_* globals
#     - adjust paths of external utilities on Windows/cygwin
#     - split the remaining arguments into test numbers (@ToTest)
#       and the optional vw executable
#     - locate the vw executable ($VW) and the 'timeout' utility
#
sub init() {
    $0 =~ s{.*/}{};
    getopts('wcdDefyE:ov:Vt:O:') || usage();
    $opt_v = 0 unless (defined $opt_v and $opt_v);
    if (defined $opt_O) {
        # Leading space: $opt_O is appended right after the vw path
        $opt_O = " $opt_O";
    } else {
        $opt_O = '';
    }

    my $hostname = `hostname`; chomp($hostname);
    printf STDERR "Testing on: hostname=%s OS=%s\n", $hostname, $^O;

    if ($^O =~ /MSWin/i) {
        v(1, "OS is $^O\n");
        # On MS Windows we need to change paths to external executables
        # Assumes cygwin is installed
        $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin';
        # And just to be safe (probably not needed):
        $Diff  = 'c:\cygwin\bin\diff.exe';
        $Cat   = 'c:\cygwin\bin\cat.exe';
    }
    elsif ($^O =~ /cygwin/i){
        v(1,"OS is $^O\n");
        # On MS Windows we need to change paths to external executables
        # Assumes cygwin is installed
        $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin';
        # And just to be safe (probably not needed):
#        $Diff  = 'c:/cygwin/bin/diff.exe';
#        $Cat   = 'c:/cygwin/bin/cat.exe';
    }
    $Epsilon = $opt_E if ($opt_E);
    $Diff .= ' --ignore-space-change' if ($opt_w);

    # Purely numeric arguments are test numbers, anything else is
    # taken to be the vw executable (see which_vw())
    my @num_args = ();
    my @exe_args = ();
    foreach my $arg (@ARGV) {
        if ($arg =~ /^\d+$/) {  # a test number
            push(@num_args, $arg);
            next;
        }
        push(@exe_args, $arg);
    }
    if (@num_args) {
        # Drop repeated test numbers: the test loop consumes one entry of
        # @ToTest per executed test, so a duplicate would stay at the head
        # of the list forever and every later requested test would be
        # silently skipped.
        my %seen;
        @ToTest = sort { $a <=> $b } grep { !$seen{$_ + 0}++ } @num_args;
        # add dummy element so we don't become empty on last test
        push(@ToTest, -1);
        $FullRun = 0;
    }
    @ARGV = @exe_args;

    $VW = which_vw();

    my $timeout = `which timeout 2>/dev/null`;
    if ($timeout =~ /timeout$/) {
        chomp($timeout);
        $TimeOut = $timeout;
        v(1,"timeout is: %s\n", $TimeOut);
    }
    if ($opt_t) {
        if ($opt_t =~ /^\d+$/) {
            $TimeOutSec = $opt_t;
        } else {
            usage("-t $opt_t: -t can only accept integer seconds");
        }
        warn "-t passed but this env doesn't have timeout installed\n"
            unless ($TimeOut);
    }
}

#
# copy_file($src_file, $dst_file)
#   Copy $src_file to $dst_file, announcing the copy on STDERR.
#   Returns the (boolean) result of File::Copy::copy(); a failed copy
#   is reported with a warning rather than being silently ignored.
#
sub copy_file {
    my ($src_file, $dst_file) = @_;
    use File::Copy;
    print STDERR "\t\t-> copying output to $dst_file\n";
    my $copied = copy($src_file, $dst_file);
    warn "$0: copy($src_file, $dst_file) failed: $!\n" unless ($copied);
    return $copied;
}

#
# trim_spaces($str)
#   Returns a copy of $str without leading and trailing white-space
#   (including trailing newlines). The argument itself is not modified.
#
sub trim_spaces($) {
    my ($text) = @_;
    for ($text) {
        s/^\s+//s;      # leading white-space
        s/\s+$//s;      # trailing white-space
        s/\n+$//s;      # any newlines still left at the end
    }
    return $text;
}

#
# ref_file($default_name)
#   Reference file existence: if we're on Windows, AND
#   an alternate reference-file exists, give precedence
#   to the alternate file (file with a '-mswin' suffix.)
#
sub ref_file($) {
    my ($default_name) = @_;

    # Alternate reference files are only considered on Windows flavors
    my $on_windows = ($^O =~ /MSWin/i or $^O =~ /cygwin/i);
    return $default_name unless ($on_windows);

    my $alternate = "$default_name-mswin";
    return (-e $alternate) ? $alternate : $default_name;
}

#
# next_paragraph()
#   Read the next paragraph (one test description) from <DATA>.
#     - comment lines (first non-blank character is '#') are skipped
#     - a trailing backslash continues the line onto the next one
#     - a paragraph ends at a blank line or at end of file
#   Returns the paragraph with surrounding white-space trimmed,
#   or nothing (empty list / undef) when there are no more paragraphs.
#
sub next_paragraph {
    my $paragraph = '';

    while (my $line = <DATA>) {
        next if ($line =~ /^\s*#/);     # skip comment lines
        if ($line =~ /\\$/) {           # support line continuation
            $line =~ s/\\\n/ /;
        }
        $paragraph .= $line if ($line =~ /\w/);

        if ($paragraph and ($line =~ /^\s*$/ || eof(DATA))) {
            # end of paragraph
            return trim_spaces($paragraph);
        }
    }

    # EOF was reached while skipping comment lines: whatever has been
    # collected so far is the last paragraph and must not be dropped.
    return trim_spaces($paragraph) if ($paragraph);
    return;
}

#
# next_test()
#   Parse the next test paragraph from __DATA__ and advance $TestNo.
#   Returns the list ($cmd, $out_ref, $err_ref, @other_ref):
#     $cmd        shell command to run, with every {VW} replaced by the
#                 vw executable plus any -O options, and prefixed with
#                 valgrind (-V) or with the timeout utility if available
#     $out_ref    reference file for stdout (undef if none was listed)
#     $err_ref    reference file for stderr ('/dev/null' if none listed)
#     @other_ref  any other existing reference files that were listed
#   Returns (undef, undef, undef, undef) when there are no more tests.
#
sub next_test() {
    my ($cmd, $out_ref, $err_ref, @other_ref);

    $TestNo++;
    my $paragraph = next_paragraph();
    return (undef, undef, undef, undef) unless (defined $paragraph);
    my @lines = split("\n", $paragraph);

    # The command line must be first.
    # Validate it before it is used in any pattern match below.
    $cmd = shift @lines;
    unless (defined $cmd) {
        return (undef, undef, undef, undef) if (eof(DATA));
        die "$0: test $TestNo: command is undefined\n";
    }

    # All remaining lines name reference files, one per line
    foreach my $line (@lines) {
        my $ref = ref_file(trim_spaces($line));
        if ($line =~ m/\.stdout\b/) {
            $out_ref = $ref;
            next;
        }
        if ($line =~ /\.stderr\b/) {
            $err_ref = $ref;
            next;
        }

        # any other reference file
        if (-e $ref) {
            push(@other_ref, $ref);
        } else {
            unless ($opt_y) {
                printf STDERR "__DATA__: line $.: " .
                          "non-existent reference file: %s\n", $ref;
            }
        }
    }

    # Braces are escaped so they are always taken literally by the regexp
    if ($cmd =~ /\{VW\}/) {
         $cmd = trim_spaces($cmd);
         $cmd =~ s/\{VW\}/$VW$opt_O/g;
    }

    unless (defined $err_ref) {
        v(2, "%s: test %s: stderr ref: undefined\n", $0, $TestNo);
        $err_ref = '/dev/null';
    }
    if ($opt_V) {
        $cmd = sprintf("%s --log-file='%s' %s",
                        $Valgrind, valgrind_errfile($TestNo), $cmd);
    } elsif ($TimeOut) {
        $cmd = sprintf("%s %u %s", $TimeOut, $TimeOutSec, $cmd);
    }
    return ($cmd, $out_ref, $err_ref, @other_ref);
}

#
# If the difference is small (least significant digits of numbers)
# treat it as ok. It may be a result of 32 vs 64 bit calculations.
#
use Scalar::Util qw(looks_like_number);

# Classify a numeric-looking word: 'nan', '+inf', '-inf',
# or '' for an ordinary finite number.
sub _nonfinite_class {
    my $num = shift() + 0;
    my $inf = 9**9**9;
    return 'nan'  if ($num != $num);    # NaN is the only value != itself
    return '+inf' if ($num ==  $inf);
    return '-inf' if ($num == -$inf);
    return '';
}

#
# lenient_array_compare(\@words1, \@words2)
#   Word by word fuzzy comparison of two tokenized lines.
#   Words which are identical strings always match. Words which differ
#   match only if both are numbers and are "close enough":
#     - absolute difference <= $Epsilon,  or
#     - for numbers with magnitude > 1: relative difference <= $Epsilon
#   NaN and infinities are never close to a finite number; they only
#   match a value of the same kind (e.g. 'nan' vs 'NaN').
#   Returns 0 when there is no meaningful difference, 1 otherwise.
#
sub lenient_array_compare($$) {
    my ($w1_ref, $w2_ref) = @_;
    my (@w1) = @$w1_ref;
    my (@w2) = @$w2_ref;

    if ($#w1 != $#w2) { # arrays not of same size
        if ($opt_v > 3) {
            v(4, "#-of-words in two arrays are different: %d != %d\n", scalar(@w1), scalar(@w2));
            v(4, "line1: "); for (my $i=0; $i <= $#w1; $i++) { v(4, " word[%d]='%s'", $i, $w1[$i]) }; v(4, "\n");
            v(4, "line2: "); for (my $i=0; $i <= $#w2; $i++) { v(4, " word[%d]='%s'", $i, $w2[$i]) }; v(4, "\n");
        }
        return 1;
    }
    my $nelem = scalar @w1;
    for (my $i = 0; $i < $nelem; $i++) {
        my ($word1, $word2) = ($w1[$i], $w2[$i]);
        next if ($word1 eq $word2);

        # There's some difference, is it significant?
        unless (looks_like_number($word1)) {
            v(4, "$word1 vs $word2: word1=$word1 is not a number!\n");
            return 1;
        }
        unless (looks_like_number($word2)) {
            v(4, "$word1 vs $word2: word2=$word2 is not a number!\n");
            return 1;
        }

        # looks_like_number() also accepts NaN and Inf. Arithmetic on
        # those yields NaN, and (NaN > $Epsilon) is always false, so they
        # must be handled before the delta test or e.g. 'nan' vs '0.5'
        # would pass as "no meaningful difference".
        my $class1 = _nonfinite_class($word1);
        my $class2 = _nonfinite_class($word2);
        if ($class1 ne '' or $class2 ne '') {
            next if ($class1 eq $class2);
            v(4, "$word1 vs $word2: non-finite value, cannot fuzzy-match\n");
            return 1;
        }

        my $delta = abs($word1 - $word2);

        if ($delta > $Epsilon) {
            # We have a 'big enough' difference, but this difference
            # may still not be meaningful in all contexts:

            # Big numbers should be compared by ratio rather than
            # by difference

            # Must ensure we can divide (avoid div-by-0)
            if (abs($word2) <= 1.0) {
                # If numbers are so small (close to zero),
                # ($delta > $Epsilon) suffices for deciding that
                # the numbers are meaningfully different
                v(4, "$word1 vs $word2: delta=$delta > Epsilon=$Epsilon\n");
                return 1;
            }
            # Now we can safely divide (since abs($word2) > 0)
            # and determine the ratio difference from 1.0
            my $ratio_delta = abs($word1/$word2 - 1.0);
            if ($ratio_delta > $Epsilon) {
                v(4, "$word1 vs $word2: ratio_delta=$ratio_delta > Epsilon=$Epsilon\n");
                return 1;
            }
        }
    }
    return 0; # no meaningful difference
}

#
# diff_lenient_float($reffile, $outfile)
#   Fuzzy comparison of two files: run a side-by-side diff and, for
#   every differing line, compare the two sides word by word using
#   lenient_array_compare().
#   Returns 0 if there is no meaningful difference, non-zero otherwise.
#   On a meaningful difference the side-by-side diff is left behind
#   (in 'lenient-diff.tmp' or 'test-<N>.lenient-diff') for inspection.
#
sub diff_lenient_float($$) {
    my ($reffile, $outfile) = @_;
    my $status = 0;

    my $tmpf = 'lenient-diff.tmp';
    mysystem("$Diff $DiffOpts $reffile $outfile >$tmpf");
    $status = $? >> 8;
    v(2, "diff produced $tmpf: status=$status\n");
    if (-s $tmpf) {
        # The diff has something in it.
        my $fuzzy_status = 0;   # assume innocent till proven guilty
        open(my $sdiff, '<', $tmpf)
            || die "$0: diff_lenient_float: $tmpf: $!\n";
        # A lexical loop variable keeps the caller's $_ intact
        while (my $diff_line = <$sdiff>) {
            chomp $diff_line;
            # Side-by-side format: "reference side | output side"
            my ($line1, $line2) = split(/\s*\|\s*/, $diff_line);
            unless (defined($line1) && defined($line2)) {
                my $save_diff_file = "test-$TestNo.lenient-diff";
                warn "$0: test $TestNo: $tmpf: line $.: fuzzy-match missing data on one of the sides. Can't compare\n$diff_line\n";
                warn "$0: test $TestNo: saving lenient diff in '$save_diff_file' for later inspection\n";
                close $sdiff;
                rename($tmpf, $save_diff_file);
                return 1;
            }
            # strip leading spaces if any (happens with --bfgs)
            $line1 =~ s/^\s+//;
            $line2 =~ s/^\s+//;
            v(3, "line1: %s\n", $line1);
            v(3, "line2: %s\n", $line2);

            # Break lines into tokens/words
            my (@w1) = split(/$WordSplit/o, $line1);
            my (@w2) = split(/$WordSplit/o, $line2);
            if (lenient_array_compare(\@w1, \@w2) != 0) {
                $fuzzy_status = 1;
                last;
            }
        }
        close $sdiff;
        $status = $fuzzy_status;
    }
    unlink($tmpf) if ($status == 0);
    return $status;
}

#
# perl internal way to emulate 'touch'
#
sub touch(@) {
    my @files = @_;
    # Set both access and modification times to the current time;
    # the result is utime's: the number of files successfully updated
    my $stamp = time;
    return utime($stamp, $stamp, @files);
}

#
# display_diff($reference_file, $actual_file)
#   Show the user a human-readable diff of the two files:
#   the diff command itself is announced on STDERR and then executed.
#
sub display_diff($$) {
    my ($expected, $actual) = @_;
    my $diff_cmd = join(' ', $Diff, $DisplayDiffOpts, $expected, $actual);

    printf STDERR "--- %s\n", $diff_cmd;
    mysystem($diff_cmd);
}

#
# diff($reffile, $outfile)
#   Compare an output file against its reference file.
#     - no reference and empty output: not a failure (returns 0)
#     - no reference but some output: failure (returns 2) unless -o
#     - with -f, differences which are only small numeric differences
#       are not considered failures
#     - with -D / -d the diff is displayed to the user
#     - with -o the reference is replaced by the output
#       (the previous reference is kept with a '.prev' suffix)
#   Returns 0 if the files are considered equal, non-zero otherwise.
#
sub diff($$) {
    my ($reffile, $outfile) = @_;
    $reffile = '' unless (defined $reffile);

    # Special case, empty file w/o reference is not considered a failure.
    # This is a most common case with stdout.
    if (! -e $reffile) {
        if (-s $outfile > 0) {
            warn "$0: test $TestNo: stdout ref: $reffile: $!\n";
            exit 1 if ($opt_e);
            return 2 unless ($opt_o);
        } else {
            # Empty output without a ref is not considered a failure
            v(1, "$0: test $TestNo: empty output w/o reference: ignored.\n");
            return 0;
        }
    }

    # Actually run the diff
    my $diftmp = 'diff.tmp';
    my $diff_cmd = join(' ', $Diff, $DiffOpts, $reffile, $outfile);
    mysystem("$diff_cmd >$diftmp");
    my $status = $? >> 8;
    v(2, "$diff_cmd >$diftmp: status=$status\n");

    # Nothing in the diff output: we are done
    unless (-s $diftmp) {
        unlink($diftmp) if ($status == 0);
        return $status;
    }

    # There's some difference
    v(2, "$diftmp has something in it. Is it meaningful?\n");

    if ($opt_f && -e $reffile && -e $outfile &&
        diff_lenient_float($reffile, $outfile) == 0) {

        print STDERR "$0: test $TestNo: minor (<$Epsilon) precision differences ignored\n";
        $status = 0;
    }

    # Print the diff only iff:
    #   1) -D is in effect  OR
    #   2) -d is in effect and diff is significant
    if ($opt_D or ($opt_d && $status)) {
        display_diff($reffile, $outfile);
    }

    if ($opt_o) {
        print STDERR "-o: overwriting reference:\n";

        if (-e $reffile) {
            print STDERR "\t$reffile -> $reffile.prev\n";
            rename($reffile, "$reffile.prev") ||
                die "FATAL: rename($reffile, $reffile.prev): $!\n";
        }
        print STDERR "\t$outfile -> $reffile\n";
        rename($outfile, $reffile) ||
            die "FATAL: rename($outfile, $reffile): $!\n";

        # With -e the difference is still reported to the caller
        $status = 0 unless ($opt_e);
    }

    unlink($diftmp) if ($status == 0);
    return $status;
}

#
# check_for_time_regression()
#   Compare last overall time to run to current to catch
#   performance regressions
#
my $LastTimeFile = 'RunTests.last.times';

#
# write_times($file, @times)
#   Save @times, space separated on a single line, in $file
#   (overwriting any previous content). Dies on any I/O error.
#
sub write_times($@) {
    my ($file, @times) = @_;
    # 3-arg open: the file name can never be mistaken for an open mode
    open(my $fh, '>', $file) || die "$0: can't open(>$file): $!\n";
    print {$fh} join(' ', @times), "\n"
        or die "$0: can't write to $file: $!\n";
    # Buffered write errors only surface when the handle is closed
    close($fh) || die "$0: can't close($file): $!\n";
}
#
# read_times($file)
#   Returns the list of white-space separated values found on the
#   first line of $file; an empty list if the file is empty.
#   Dies if the file cannot be opened.
#
sub read_times($) {
    my ($file) = @_;
    # 3-arg open: the file name can never be mistaken for an open mode
    open(my $fh, '<', $file) || die "$0: can't open($file): $!\n";
    my $line = <$fh>;
    close $fh;

    # Empty file: nothing to chomp or split
    return () unless (defined $line);
    chomp $line;
    return (split(' ', $line));
}

sub check_for_time_regression() {
    # A run more than 2% slower than the last one is flagged
    my $tolerate_regress = 1.02;
    my $pct_change = 0.0;

    # Only the CPU time of child processes (the vw runs) is compared
    my ($user1, $system1, $cuser1, $csystem1) = times;
    my $overall_time1 = $cuser1 + $csystem1;

    if (-e $LastTimeFile) {
        my @last_times = read_times($LastTimeFile);
        my ($cuser0, $csystem0) = @last_times[2, 3];
        unless (defined($csystem0) and defined($cuser0)) {
            die "$0: undefined times in saved times file: $LastTimeFile," .
                    " try removing it\n"
        }
        my $overall_time0 = $cuser0 + $csystem0;
        # The small constant avoids a division by zero
        $pct_change = 100 * ($overall_time1 - $overall_time0) / (1e-4+$overall_time0);

        if ($overall_time0 == 0) {
            die "$0: Bad times in saved times file: $LastTimeFile," .
                    " try removing it\n"
        }
        if ($overall_time1/$overall_time0 > $tolerate_regress) {
            printf STDERR "$0: RUNTIME REGRESSION: " .
                    "%.2f sec vs last time %.2f sec. (%.2f%% worse)\n",
                    $overall_time1, $overall_time0, $pct_change;
        }
    }

    # Current times become the baseline for the next run
    write_times($LastTimeFile, $user1, $system1, $cuser1, $csystem1);
    printf STDERR
        "$0 runtime: user %g, system %g, total %g sec (%+.2f%% vs. last)\n",
                $cuser1, $csystem1, $overall_time1, $pct_change;
}

# only unlink relative path, plain files
# e.g. avoids trying to unlink /dev/null when running as root
sub safe_unlink($) {
    my ($path) = @_;
    # Refuse absolute paths and anything which is not a plain file
    if ($path =~ m{^/} or ! -f $path) {
        return 0;
    }
    return unlink($path);
}

#
# run_tests()
#   Main loop: for every test described under __DATA__ (or only those
#   requested on the command line):
#     - run the command, capturing stdout and stderr into files named
#       after the basenames of their reference files
#     - compare stdout, stderr and any other listed output file against
#       their references
#     - count every failure in $ErrorCount (or exit at once with -e)
#   After a full, error-free, non-valgrind run the overall run-time is
#   compared with the previous one to catch performance regressions.
#
sub run_tests() {
    print STDERR "$0: '-D' to see any diff output\n"
        unless ($opt_D);
    print STDERR "$0: '-d' to see only significant diff output\n"
        unless ($opt_d);
    print STDERR "$0: '-o' to force overwrite references\n"
        unless ($opt_o);
    print STDERR "$0: '-e' to abort/exit on first failure\n"
        unless ($opt_e);

    my ($cmd, $out_ref, $err_ref, @more_refs);
    my ($outf, $errf);

    mkdir('models', 0755) unless (-d 'models');

    # Start from a clean slate: remove left-overs of previous runs
    unlink(glob('*.tmp'));
    unlink(glob('*.cache'));
    unlink(glob('*/*.cache'));

    while (($cmd, $out_ref, $err_ref, @more_refs) = next_test()) {
        last unless (defined $cmd);
        if (@ToTest) {
            # Explicit test numbers were requested: skip all others
            if ($ToTest[0] != $TestNo) {
                next;
            } else {
                shift(@ToTest);
            }
        }

        # Outputs are captured only when there is a reference to compare to
        $outf = (defined($out_ref) && -f $out_ref)
                    ? basename($out_ref)
                    : '/dev/null';

        $errf = (defined($err_ref) && -f $err_ref)
                    ? basename($err_ref)
                    : '/dev/null';

        # Run the test
        print STDERR "Test $TestNo: ($cmd) >$outf 2>$errf\n" if ($opt_c);
        mysystem("($cmd) >$outf 2>$errf");
        my $full_status = $?;
        my $status = $full_status >> 8;
        unless ($opt_V) {
            if (my $failure = command_failed($cmd)) {
                # A command which crashed, timed-out or exited with an
                # error is a failed test: it must be counted, otherwise
                # the run would end with a zero (success) exit status.
                $ErrorCount++;
                print STDERR `$Cat $errf`
                    unless ($failure == 124);
                if ($opt_e) {
                    printf STDERR "$0: exiting with status=$failure\n";
                    exit $failure;
                }
                next;
            }
        }
        if ($full_status) {
            $ErrorCount++;
            if ($opt_V && $status == 100) {
                my $errfile = valgrind_errfile($TestNo);
                warn "$0: test $TestNo: FAILED: valgrind errors in $errfile\n";
            } elsif ($TimeOut && $status == 124) {
                warn "$0: test $TestNo: FAILED: timeout $TimeOutSec exceeded\n";
            } elsif ($status == 0) {
                # Non-zero wait status with a zero exit code: signal
                warn "$0: test $TestNo: '$cmd' died from signal " .
                        ($full_status & 127) . "\n";
            } else {
                warn "$0: test $TestNo: '$cmd' failed: status=$status\n";
            }
            # Exit with the command's exit code, not the raw wait status:
            # only the low 8 bits of an exit status survive, and those
            # are zero when the exit code lives in the high byte.
            exit((($status & 255) || 1)) if ($opt_e);
            next;
        }

        # command succeeded
        # -- compare stdout
        $status = diff($out_ref, $outf);
        if ($status) {
            $ErrorCount++;
            # $cmd is passed as an argument (not interpolated into the
            # format) so that a '%' in the command is printed literally
            printf STDERR "%s: test %d: FAILED: ref(%s) != stdout(%s)\n\tcmd: %s\n",
                $0, $TestNo, (defined($out_ref) ? $out_ref : ''), $outf, $cmd;

            copy_file($outf, "$outf.test$TestNo") if ($opt_y);
            exit $status if ($opt_e);
        } else {
            if (defined $out_ref) {
                print STDERR "$0: test $TestNo: stdout OK\n";
                safe_unlink($outf);
            } else {
                v(1, "$0: test $TestNo: stdout OK (no reference)\n");
            }
        }

        # -- compare stderr
        if (! -e $err_ref  and  ! $opt_o) {
            $ErrorCount++;
            print STDERR "$0: test $TestNo: FAILED: stderr ref: $err_ref: $!\n\tcmd: $cmd\n";
            exit 1 if ($opt_e);
            next;
        }
        $status = diff($err_ref, $errf);
        if ($status) {
            $ErrorCount++;
            printf STDERR "%s: test %d: FAILED: ref(%s) != stderr(%s)\n\tcmd: %s\n",
                $0, $TestNo, $err_ref, $errf, $cmd;

            copy_file($errf, "$errf.test$TestNo") if ($opt_y);
            exit $status if ($opt_e);
        } else {
            print STDERR "$0: test $TestNo: stderr OK\n";
            safe_unlink($errf);
        }

        # -- compare all other reference files
        foreach my $ref_path (@more_refs) {
            my $ref_base = basename($ref_path);
            # Verify that it exists on the shell line.
            # \Q..\E: the file name is matched literally, not as a regexp
            unless ($cmd =~ /\Q$ref_base\E/) {
                printf STDERR "%s: test %d: FAILED: " .
                    "no match for '%s' in command: '%s'\n" .
                    "Unable to compare output to reference file\n",
                            $0, $TestNo, $ref_base, $cmd;
                $ErrorCount++;
                # The last diff status is 0 at this point, so an explicit
                # non-zero exit code is needed to signal the failure
                exit 1 if ($opt_e);
                next;
            }
            $status = diff($ref_path, $ref_base);
            if ($status) {
                $ErrorCount++;
                printf STDERR "%s: test %d: FAILED: ref(%s) != (%s)\n\tcmd: %s\n",
                    $0, $TestNo, $ref_path, $ref_base, $cmd;
                # NOTE(review): unlike stdout/stderr above, -y copies the
                # bad output over the reference file itself -- confirm
                # that this is intended.
                copy_file($ref_base, $ref_path) if ($opt_y);
                exit $status if ($opt_e);
            } else {
                print STDERR "$0: test $TestNo: $ref_base OK\n";
                unlink($ref_base);
            }
        }
    }

    # Timing is only meaningful (and recorded) for complete, clean runs
    if ($FullRun == 0) {
        v(1, "Partial run: not recording overall time\n");
    } elsif ($ErrorCount > 0) {
        v(1, "Errors found: not recording overall time\n");
    } elsif ($opt_V) {
        v(1, "valgrind run: not recording overall time\n");
    } else {
        check_for_time_regression();
    }
}

# --- main
init();
run_tests();
# The exit status is the number of errors found. Only the low 8 bits of
# an exit status are visible to the parent process, so the count is
# capped at 255: otherwise exactly 256 errors would look like success.
exit($ErrorCount > 255 ? 255 : $ErrorCount);

#
# Add tests below the __DATA__ line
#
# Each test is a sequence of non-blank lines, terminated
# by an empty line (or EOF), essentially a paragraph.
#
# Each paragraph/test should look like:
#
#   1st line: shell command to run.
#
#   2nd-to-Nth line: one-or-more reference files to compare outputs to.
#   ONE reference file per line
#   (Note: we indent these lines just for readability.)
#
#   You may break very long lines using \ at EOL.
#
#   # -------------------------------------
#   shell command which may include {VW} ...
#       reference/file1
#       reference/file2
#       ...more reference files...
#
#   # -------------------------------------
#
#   shell-command can be anything accepted by bash, including pipes,
#   redirections, etc., even a sequence of shell-commands separated by ';'
#
#   Inside any shell command, all (optional) appearances of {VW}
#   will be substituted by the vw executable under test.
#
#   By default, 'vw' under our parent dir (../vowpalwabbit/vw) is tested.
#   To run against a different reference executable, pass the
#   wanted executable as an argument to RunTests
#
# The output line-items are reference files to compare outputs to:
#   - *.stdout: expected (reference file) standard output
#   - *.stderr: expected (reference file) standard error
#   - Any other relative path, pointing to a reference file to compare
#     to, this allows adding references to any explicitly named file
#     appearing on the shell-line, the only requirement is that the
#     _basename_ (path stripped of directory) of the reference file
#     would exactly match its respective file in the shell-command.
#
# For example:
#
#   #-------------------------------------------------------
#   # Test 237
#   {VW} ... -p test75.predict --readable_model test75.rmodel
#       test/train-sets/ref/test75.stderr
#       test/pred-sets/ref/test75.predict
#       test/whatever/ref/test75.rmodel
#
#   #-------------------------------------------------------
#
# All reference filenames are relative to this (test) directory
#
# Only the STDOUT and STDERR streams in the shell command
# are implicit (so only their reference files need to be specified):
# The implicit names would be matched only by their extension
# as opposed to the full basename of the file.
#
# The two implicit names are:
#       TestXXX.stdout
#       TestXXX.stderr
#
# Windows note:
#
#   Due to differences in Random-Number-Generators in Windows,
#   floating-point outputs may differ in some tests (not all).
#
#   To minimize the need for changes (leverage existing tests and
#   reference files as much as possible), on Windows we check for
#   existence of files with '-mswin' suffix:
#       *.stderr-mswin
#       *.stdout-mswin
#   and if any of them exists, we use it instead.
#
__DATA__
# Test 1:
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat \
    -f models/0001_1.model -c --passes 8 --invariant \
    --ngram 3 --skips 1 --holdout_off
        train-sets/ref/0001.stderr

# Test 2: checking predictions as well
{VW} -k -t -d train-sets/0001.dat -i models/0001_1.model -p 0001.predict --invariant
    test-sets/ref/0001.stderr
    pred-sets/ref/0001.predict

# Test 3: without -d, training only
{VW} -k -d train-sets/0002.dat -f models/0002.model --invariant
    train-sets/ref/0002.stderr

# Test 4: same, with -d
{VW} -k -d train-sets/0002.dat -f models/0002.model --invariant
    train-sets/ref/0002.stdout
    train-sets/ref/0002.stderr

# Test 5: add -q .., adaptive, and more (same input, different outputs)
{VW} -k --initial_t 1 --adaptive --invariant -q Tf -q ff -f models/0002a.model -d train-sets/0002.dat
    train-sets/ref/0002a.stderr

# Test 6: run predictions on Test 4 model
# Pretending the labels aren't there
{VW} -k -t -i models/0002.model -d train-sets/0002.dat -p 0002b.predict
    test-sets/ref/0002b.stderr
    pred-sets/ref/0002b.predict

# Test 7: using normalized adaptive updates and a low --power_t
{VW} -k --power_t 0.45 -f models/0002c.model -d train-sets/0002.dat
    train-sets/ref/0002c.stderr

# Test 8: predicts on test 7 model
{VW} -k -t -i models/0002c.model -d train-sets/0002.dat -p 0002c.predict
    test-sets/ref/0002c.stderr
    pred-sets/ref/0002c.predict

# Test 9: label-dependent features with csoaa_ldf
{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.csoaa.predict --passes 10 --invariant --csoaa_ldf multiline --holdout_off --noconstant
    train-sets/ref/cs_test.ldf.csoaa.stderr
    train-sets/ref/cs_test.ldf.csoaa.predict

# Test 10: label-dependent features with wap_ldf
{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.wap.predict --passes 10 --invariant --wap_ldf multiline --holdout_off --noconstant
    train-sets/ref/cs_test.ldf.wap.stderr
    train-sets/ref/cs_test.ldf.wap.predict

# Test 11: one-against-all
{VW} -k --oaa 10 -c --passes 10 -d train-sets/multiclass --holdout_off
    train-sets/ref/oaa.stderr

# Test 12: Error Correcting Tournament
{VW} -k --ect 10 --error 3 -c --passes 10 --invariant -d train-sets/multiclass --holdout_off
    train-sets/ref/multiclass.stderr

# Test 13: Run search (dagger) on wsj_small for 6 passes extra features
{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 6 \
    --search_task sequence --search 45 --search_alpha 1e-6 \
    --search_max_bias_ngram_length 2 --search_max_quad_ngram_length 1 \
    --holdout_off
        train-sets/ref/search_wsj.stderr

# Test 14: Run search (searn) on wsj_small for 6 passes extra features
{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 6 \
    --search_task sequence --search 45 --search_alpha 1e-6 \
    --search_max_bias_ngram_length 2 --search_max_quad_ngram_length 1 \
    --holdout_off --search_passes_per_policy 3 --search_interpolation policy
        train-sets/ref/search_wsj2.dat.stdout
        train-sets/ref/search_wsj2.dat.stderr

# Test 15: LBFGS on zero derivative input
{VW} -k -c -d train-sets/zero.dat --loss_function=squared -b 20 --bfgs --mem 7 --passes 5 --l2 1.0 --holdout_off
    train-sets/ref/zero.stdout
    train-sets/ref/zero.stderr

# Test 16: LBFGS early termination
{VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 --holdout_off
    train-sets/ref/rcv1_small.stdout
    train-sets/ref/rcv1_small.stderr

# Test 17: Run LDA with 100 topics on 1000 Wikipedia articles
{VW} -k --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -l 1 -b 13 --minibatch 128 -d train-sets/wiki256.dat
    train-sets/ref/wiki1K.stderr

# Test 18: Run search on seq_small for 12 passes, 4 passes per policy
{VW} -k -c -d train-sets/seq_small --passes 12 --invariant --search 4 --search_task sequence --holdout_off
    train-sets/ref/search_small.stderr

# Test 19: neural network 3-parity with 2 hidden units
{VW} -k -c -d train-sets/3parity --hash all --passes 3000 -b 16 --nn 2 -l 10 --invariant -f models/0021.model --random_seed 19 --quiet --holdout_off
    train-sets/ref/3parity.stderr

# Test 20: neural network 3-parity with 2 hidden units (predict)
{VW} -d train-sets/3parity -t -i models/0021.model -p 0022.predict
    pred-sets/ref/0022.stderr
    pred-sets/ref/0022.predict

# Test 21: cubic features -- on a parity test case
{VW} -k -c -f models/xxor.model -d train-sets/xxor.dat --cubic abc --passes 100 --holdout_off --progress 1.33333
    train-sets/ref/xxor.stderr

# Test 22: matrix factorization -- training
{VW} -k -d train-sets/ml100k_small_train -b 16 -q ui --rank 10 \
    --l2 2e-6 --learning_rate 0.05 --passes 2 \
    --decay_learning_rate 0.97 --power_t 0 -f models/movielens.reg \
    -c --loss_function classic --holdout_off
        train-sets/ref/ml100k_small.stdout
        train-sets/ref/ml100k_small.stderr

# Test 23: matrix factorization -- testing
{VW} -i models/movielens.reg -t -d test-sets/ml100k_small_test
    test-sets/ref/ml100k_small.stdout
    test-sets/ref/ml100k_small.stderr

# Test 24: active-learning -- training
{VW} -k --active --simulation --mellowness 0.000001 -d train-sets/rcv1_small.dat -l 10 --initial_t 10 --random_seed 3
    train-sets/ref/active-simulation.t24.stderr

# Test 25: bagging -- training regressor
{VW} -k -d train-sets/0002.dat -f models/bs.reg.model --bootstrap 4 -p bs.reg.predict
    train-sets/ref/bs.reg.stderr
    train-sets/ref/bs.reg.predict

# Test 26: bagging -- predicting with bagged regressor
{VW} -d train-sets/0002.dat -i models/bs.reg.model -p bs.prreg.predict -t
    train-sets/ref/bs.prreg.stderr
    train-sets/ref/bs.prreg.predict

# Test 27: bagging -- binary classifiers
{VW} -d train-sets/0001.dat -f models/bs.vote.model --bootstrap 4 --bs_type vote -p bs.vote.predict
    train-sets/ref/bs.vote.stderr
    train-sets/ref/bs.vote.predict

# Test 28: bagging -- predict with bagged classifier
{VW} -d train-sets/0001.dat -i models/bs.vote.model -p bs.prvote.predict -t
    train-sets/ref/bs.prvote.stderr
    train-sets/ref/bs.prvote.predict

# Test 29: affix features
{VW} -d train-sets/affix_test.dat -k -c --passes 10 --holdout_off --affix -2
    train-sets/ref/affix_test.stderr

# Test 30: train --l1 regularized model
{VW} -d train-sets/0001.dat -f models/mask.model --invert_hash mask.predict --l1 0.01
    train-sets/ref/mask.stderr

# Test 31: train model using --feature_mask
{VW} -d train-sets/0001.dat --invert_hash remask.predict --feature_mask models/mask.model -f models/remask.model
    train-sets/ref/remask.stderr

# Test 32: train model using --feature_mask and --initial_regressor
{VW} -d train-sets/0001.dat --feature_mask models/mask.model -i models/remask.model
    train-sets/ref/remask.final.stderr

# Test 33: train model for topk recommender
{VW} -d train-sets/topk.vw -f topk.model -q MF --passes 100 --cache_file topk-train.cache -k --holdout_off
    train-sets/ref/topk-train.stderr

# Test 34: use the topk recommender model from Test 33 to recommend (--top 2)
{VW} -P 1 -d train-sets/topk.vw -i topk.model --top 2 -p topk-rec.predict
    train-sets/ref/topk-rec.stderr
    train-sets/ref/topk-rec.predict

# Test 35: non-centered data-set where constant >> 0
#   To test the new --constant option without which performance is very weak
{VW} -k --passes 100 -c --holdout_off --constant 1000 -d train-sets/big-constant.dat
    train-sets/ref/big-constant.stderr

# Test 36: new option: --progress w/ integer arg
{VW} -k -d train-sets/0001.dat --progress 10
    train-sets/ref/progress-10.stderr

# Test 37: new-option: --progress w/ floating-point arg
#           + alternate short form (-P)
{VW} -k -d train-sets/0001.dat -P 0.5
    train-sets/ref/progress-0.5.stderr

# Test 38: --nn without --quiet to avoid nn regressions
#   (Needs to be a simple test, not one sensitive to symmetry breaking)
{VW} -k -d train-sets/0001.dat --nn 1
    train-sets/ref/nn-1-noquiet.stderr

# Test 39: cb with dr
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dr --ngram 2 --skips 4 -b 24 -l 0.25
    train-sets/ref/rcv1_raw_cb_dr.stderr

# Test 40: cb with ips
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type ips --ngram 2 --skips 4 -b 24 -l 0.125
    train-sets/ref/rcv1_raw_cb_ips.stderr

# Test 41: cb with dm
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dm --ngram 2 --skips 4 -b 24 -l 0.125 -f cb_dm.reg
    train-sets/ref/rcv1_raw_cb_dm.stderr

# Test 42: --lda --passes 2 hang regression
{VW} -k -d train-sets/lda-2pass-hang.dat --lda 10 -c --passes 2 --holdout_off
    train-sets/ref/lda-2pass-hang.stderr

# Test 43: search sequence labeling, non-ldf train
{VW} -k -c -d train-sets/sequence_data --passes 20 --invariant --search_rollout ref --search_alpha 1e-8 --search_task sequence --search 5 --holdout_off -f models/sequence_data.model
    train-sets/ref/sequence_data.nonldf.train.stderr

# Test 44: search sequence labeling, non-ldf test
{VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.nonldf.test.predict
    train-sets/ref/sequence_data.nonldf.test.stderr
    train-sets/ref/sequence_data.nonldf.test.predict

# Test 45: make sure that history works
{VW} -k -c -d train-sets/seq_small2 --passes 4 --search 4 --search_task sequence --holdout_off
    train-sets/ref/search_small2.stderr

# Test 46: search sequence labeling, ldf train
{VW} -k -c -d train-sets/sequence_data --passes 20 \
    --search_rollout ref --search_alpha 1e-8 \
    --search_task sequence_demoldf --csoaa_ldf m --search 5 \
    --holdout_off -f models/sequence_data.ldf.model --noconstant
        train-sets/ref/sequence_data.ldf.train.stderr

# Test 47: search sequence labeling, ldf test
{VW} -d train-sets/sequence_data -t -i models/sequence_data.ldf.model -p sequence_data.ldf.test.predict --noconstant
    train-sets/ref/sequence_data.ldf.test.stderr
    train-sets/ref/sequence_data.ldf.test.predict

# Test 48: search sequence SPAN labeling BIO, non-ldf train, no rollouts
{VW} -k -c -d train-sets/sequencespan_data --passes 20 --invariant \
    --search_rollout none --search_task sequencespan --search 7 \
    --holdout_off -f models/sequencespan_data.model
        train-sets/ref/sequencespan_data.nonldf.train.stderr

# Test 49: search sequence SPAN labeling BIO, non-ldf test
{VW} -d train-sets/sequencespan_data -t -i models/sequencespan_data.model -p sequencespan_data.nonldf.test.predict
    train-sets/ref/sequencespan_data.nonldf.test.stderr
    train-sets/ref/sequencespan_data.nonldf.test.predict

# Test 50: search sequence SPAN labeling BILOU, non-ldf train
{VW} -k -c -d train-sets/sequencespan_data --passes 20 --invariant \
    --search_rollout ref --search_alpha 1e-8 --search_task sequencespan \
    --search_span_bilou --search 7 --holdout_off \
    -f models/sequencespan_data.model
        train-sets/ref/sequencespan_data.nonldf-bilou.train.stderr

# Test 51: search sequence SPAN labeling BILOU, non-ldf test
{VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.nonldf-bilou.test.predict
    train-sets/ref/sequencespan_data.nonldf-bilou.test.stderr
    train-sets/ref/sequencespan_data.nonldf-bilou.test.predict

# Test 52: silly test for "argmax" task
{VW} -d train-sets/argmax_data -k -c --passes 20 --search_rollout ref --search_alpha 1e-8 --search_task argmax --search 2 --holdout_off
    train-sets/ref/argmax_data.stderr

# Test 53: (holdout-broken regression)
# ensure we have no holdout loss of '0 h'
{VW} -k -c --passes 2 -d train-sets/0001.dat
    train-sets/ref/holdout-loss-not-zero.stderr

# Test 54: stagewise poly with exponent 0.25
#### In the following stage_poly tests there are minute differences in losses, which are not fuzzy-diffed;
#### thus stderr is silenced (--quiet) and only the (fuzzy-diffed) predictions are compared.
{VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat -p stage_poly.s025.predict --quiet
    train-sets/ref/stage_poly.s025.stderr
    train-sets/ref/stage_poly.s025.predict

# Test 55: stagewise poly with exponent 1.0
{VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat --quiet
    train-sets/ref/stage_poly.s100.stderr

# Test 56: stagewise poly with exponent 0.25 and doubling batches
{VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s025.doubling.predict --quiet
    train-sets/ref/stage_poly.s025.doubling.stderr
    train-sets/ref/stage_poly.s025.doubling.predict

# Test 57: stagewise poly with exponent 1.0 and doubling batches
{VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s100.doubling.predict --quiet
    train-sets/ref/stage_poly.s100.doubling.stderr
    train-sets/ref/stage_poly.s100.doubling.predict

# Test 58: library test, train the initial model
{VW} -c -k -d train-sets/library_train -f models/library_train.w -q st --passes 100 --hash all --noconstant --csoaa_ldf m --holdout_off
    train-sets/ref/library_train.stdout
    train-sets/ref/library_train.stderr

# Test 60: empty test, bad builds (without make clean)
# sometimes cause a SEGV even on empty input
echo "" | {VW}
    train-sets/ref/empty-set.stderr

# Test 61: daemon test
./daemon-test.sh
    test-sets/ref/vw-daemon.stdout

# Test 62: SVM linear kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 -p ksvm_train.linear.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.linear.stderr
    train-sets/ref/ksvm_train.linear.predict

# Test 63: SVM polynomial kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 --kernel poly -p ksvm_train.poly.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.poly.stderr
    train-sets/ref/ksvm_train.poly.predict

# Test 64: SVM rbf kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 --kernel rbf -p ksvm_train.rbf.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.rbf.stderr
    train-sets/ref/ksvm_train.rbf.predict

# Test 65: Run search (dagger) on an entity-relation recognition data set,
# er_small, for 6 passes with constraints
{VW} -k -c -d train-sets/er_small.vw --passes 6 --search_task entity_relation --search 10 --constraints --search_alpha 1e-8
    train-sets/ref/search_er.stderr

# Test 66: Train a dependency parser with search (dagger)
# on wsj_small.dparser.vw.gz for 6 passes
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz --passes 6 --search_task dep_parser --search 12  --search_alpha 1e-4 --search_rollout oracle --holdout_off
    train-sets/ref/search_dep_parser.stderr

# Test 67: classification with data from dictionaries
# (eg embeddings or gazetteers) -- note that this is impossible without
# dictionaries because --ignore w; also test to make sure gzipped dicts
# work and dictionary redundancy checking works
{VW} -k -c -d train-sets/dictionary_test.dat --binary --ignore w --holdout_off --passes 32 --dictionary w:dictionary_test.dict --dictionary w:dictionary_test.dict.gz --dictionary_path train-sets
    train-sets/ref/dictionary_test.stderr

# Test 68: Search for multiclass classification
{VW} -k -c -d train-sets/multiclass.sch --passes 20 --search_task multiclasstask --search 10 --search_alpha 1e-4 --holdout_off
    train-sets/ref/search_multiclass.stderr

# Test 69: (see Test 43/Test 44): search sequence labeling, with selective branching
{VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.nonldf.beam.test.predict --search_metatask selective_branching --search_max_branch 10 --search_kbest 10
    train-sets/ref/sequence_data.nonldf.beam.test.stderr
    train-sets/ref/sequence_data.nonldf.beam.test.predict

# Test 70: (see Test 46/47) search sequence labeling, ldf test, with selective branching
{VW} -d train-sets/sequence_data -t -i models/sequence_data.ldf.model -p sequence_data.ldf.beam.test.predict --search_metatask selective_branching --search_max_branch 10 --search_kbest 10 --noconstant
    train-sets/ref/sequence_data.ldf.beam.test.stderr
    train-sets/ref/sequence_data.ldf.beam.test.predict

# Test 71: autolink
{VW} -d train-sets/0002.dat --autolink 1 --examples 100 -p 0002.autolink.predict
    train-sets/ref/0002.autolink.stderr
    train-sets/ref/0002.autolink.predict

# Test 72: train FTRL-Proximal
{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 1 --ftrl --ftrl_alpha 0.01 --ftrl_beta 0 --l1 2
    train-sets/ref/0001_ftrl.stderr

# Test 73: test FTRL-Proximal
{VW} -k -t -d train-sets/0001.dat -i models/0001_ftrl.model -p 0001_ftrl.predict
    test-sets/ref/0001_ftrl.stderr
    pred-sets/ref/0001_ftrl.predict

# Test 74: cb evaluation
{VW} -d train-sets/rcv1_cb_eval --cb 2 --eval
    train-sets/ref/rcv1_cb_eval.stderr

# Test 75: Log_multi
{VW} --log_multi 10 -d train-sets/multiclass
    train-sets/ref/log_multi.stderr

# Test 76: cbify, epsilon-greedy
{VW} --cbify 10 --epsilon 0.05 -d train-sets/multiclass
    train-sets/ref/cbify_epsilon.stderr

# Test 77: cbify, tau first
{VW} --cbify 10 --first 5 -d train-sets/multiclass
    train-sets/ref/cbify_first.stderr

# Test 78: cbify, bag
{VW} --cbify 10 --bag 7 -d train-sets/multiclass
    train-sets/ref/cbify_bag.stderr

# Test 79: cbify, cover
{VW} --cbify 10 --cover 3 -d train-sets/multiclass
    train-sets/ref/cbify_cover.stderr

# Test 80: lrq empty namespace
{VW} --lrq aa3 -d train-sets/0080.dat
    train-sets/ref/0080.stderr

# Test 81: train FTRL-PiSTOL
{VW} -k -d train-sets/0001.dat -f models/ftrl_pistol.model --passes 1 --pistol
    train-sets/ref/ftrl_pistol.stderr

# Test 82: test FTRL-PiSTOL
{VW} -k -t -d train-sets/0001.dat -i models/ftrl_pistol.model -p ftrl_pistol.predict
    test-sets/ref/ftrl_pistol.stderr
    pred-sets/ref/ftrl_pistol.predict

# Test 83: check redefine functionality
{VW} -k -d train-sets/0080.dat --redefine := --redefine y:=: --redefine x:=arma --ignore x -q yy
    train-sets/ref/redefine.stderr

# Test 84: check cb_adf
{VW} --cb_adf -d train-sets/cb_test.ldf --noconstant
    train-sets/ref/cb_adf.stderr

# Test 85: check multilabel_oaa
{VW} --multilabel_oaa 10 -d train-sets/multilabel -p multilabel.predict
    train-sets/ref/multilabel.stderr
    pred-sets/ref/multilabel.predict

# Test 86: check --csoaa_rank on csoaa_ldf
{VW} --csoaa_ldf multiline --csoaa_rank -d train-sets/cs_test_multilabel.ldf -p multilabel_ldf.predict --noconstant
    train-sets/ref/multilabel_ldf.stderr
    pred-sets/ref/multilabel_ldf.predict

# Test 87: check --rank_all on cb_adf
{VW} --cb_adf --rank_all -d train-sets/cb_test.ldf -p cb_adf_rank.predict --noconstant
    train-sets/ref/cb_adf_rank.stderr
    pred-sets/ref/cb_adf_rank.predict

# Test 88: named labels at training time
{VW} --named_labels det,noun,verb --oaa 3 -d train-sets/test_named  -k -c --passes 10 --holdout_off -f models/test_named.model
    train-sets/ref/test_named_train.stderr

# Test 89: named labels at prediction
{VW} -i models/test_named.model -t -d train-sets/test_named -p test_named.predict
    train-sets/ref/test_named_test.stderr
    pred-sets/ref/test_named.predict

# Test 90: named labels at training time (csoaa)
{VW} --named_labels det,noun,verb --csoaa 3 -d train-sets/test_named_csoaa  -k -c --passes 10 --holdout_off -f models/test_named_csoaa.model
    train-sets/ref/test_named_csoaa_train.stderr

# Test 91: named labels at prediction (csoaa)
{VW} -i models/test_named_csoaa.model -t -d train-sets/test_named_csoaa -p test_named_csoaa.predict
    train-sets/ref/test_named_csoaa_test.stderr
    pred-sets/ref/test_named_csoaa.predict

# Test 92: check -q :: and --oaa with --invert_hash
printf '3 |f a b c |e x y z\n2 |f a y c |e x\n' | \
    {VW} --oaa 3 -q :: --invert_hash inv_hash.cmp && \
        tail -n +2 inv_hash.cmp > inv_hash.cmp.new && \
            rm inv_hash.cmp && \
                mv inv_hash.cmp.new inv_hash.cmp
    train-sets/ref/inv_hash.stderr
    pred-sets/ref/inv_hash.cmp

#Test 93:  check cb_adf with doubly robust option
{VW} --cb_adf --rank_all -d train-sets/cb_test.ldf -p cb_adf_dr.predict --cb_type dr
    train-sets/ref/cb_adf_dr.stderr
    pred-sets/ref/cb_adf_dr.predict

# Test 94: experience replay version of test 1
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat \
    -c --passes 8 --invariant \
    --ngram 3 --skips 1 --holdout_off --replay_b 100
        train-sets/ref/0001-replay.stderr

# Test 95: named labels at training time (csoaa) with experience replay
{VW} --named_labels det,noun,verb --csoaa 3 \
    -d train-sets/test_named_csoaa -k -c --passes 10 --holdout_off \
    -f models/test_named_csoaa.model --replay_c 100
        train-sets/ref/test_named_csoaa_train-replay.stderr

# Test 96: backwards compatibility
printf '3 |f a b c |e x y z\n2 |f a y c |e x\n' | \
    {VW} -i simple_model --invert_hash inv_hash.cmp && \
        tail -n +2 inv_hash.cmp
   test-sets/ref/backwards.stderr
   test-sets/ref/backwards.stdout

# Test 97: train with --save_resume and save the model (loaded by Tests 98, 99 and 155)
{VW} -d train-sets/0001.dat -f models/0097.model --save_resume
        train-sets/ref/0097.stderr

# Test 98: checking predictions as well
{VW} --preserve_performance_counters -d train-sets/0001.dat -i models/0097.model -p 0098.predict
    test-sets/ref/0098.stderr
    pred-sets/ref/0098.predict

# Test 99: checking predictions with testing
{VW} -d train-sets/0001.dat -i models/0097.model -p 0099.predict
    test-sets/ref/0099.stderr
    pred-sets/ref/0099.predict

# Test 100: action costs, no rollout
{VW} -k -c -d train-sets/sequence_data --passes 20 --invariant --search_rollout none --search_task sequence_ctg --search 5 --holdout_off
    train-sets/ref/sequence_data.ctg.train.stderr

# Test 101: active cover
{VW} --loss_function logistic --binary --active_cover -d train-sets/rcv1_mini.dat -f models/active_cover.model
    train-sets/ref/active_cover.stderr

# Test 102: active cover (predict)
{VW} -i models/active_cover.model -t -d test-sets/rcv1_small_test.data -p active_cover.predict
    test-sets/ref/active_cover.stderr
    pred-sets/ref/active_cover.predict

# Test 103: active cover oracular
{VW} --loss_function logistic --binary --active_cover --oracular -d ./train-sets/rcv1_small.dat
    train-sets/ref/active_cover_oracular.stderr

# Test 104: check cb_adf with --cb_type mtr
{VW} --cb_adf -d train-sets/cb_test.ldf --cb_type mtr --noconstant
    train-sets/ref/cb_adf_mtr.stderr

# Test 105: train FTRL-Proximal early stopping
{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 10 --ftrl --ftrl_alpha 3.0 --ftrl_beta 0 --l1 0.9 --cache
    train-sets/ref/0001_ftrl_holdout.stderr

# Test 106: test FTRL-Proximal early stopping prediction
{VW} -k -t -d train-sets/0001.dat -i models/0001_ftrl.model -p 0001_ftrl_holdout.predict
    test-sets/ref/0001_ftrl_holdout_106.stderr
    pred-sets/ref/0001_ftrl_holdout.predict

# Test 107: train FTRL-Proximal no early stopping
{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 10 --ftrl --ftrl_alpha 0.01 --ftrl_beta 0 --l1 2 --cache --holdout_off
    train-sets/ref/0001_ftrl_holdout_off.stderr

# Test 108: test FTRL-Proximal no early stopping
{VW} -k -t -d train-sets/0001.dat -i models/0001_ftrl.model -p 0001_ftrl_holdout_off.predict --holdout_off
    test-sets/ref/0001_ftrl_holdout_off.stderr
    pred-sets/ref/0001_ftrl_holdout_off.predict

# Test 109: --probabilities --oaa
{VW} -d train-sets/probabilities.dat --probabilities --oaa=4 --loss_function=logistic -p oaa_probabilities.predict
   train-sets/ref/oaa_probabilities.stderr
   pred-sets/ref/oaa_probabilities.predict

# Test 110: --probabilities --csoaa_ldf=mc
{VW} -d train-sets/cs_test.ldf --probabilities --csoaa_ldf=mc --loss_function=logistic -p csoaa_ldf_probabilities.predict
   train-sets/ref/csoaa_ldf_probabilities.stderr
   pred-sets/ref/csoaa_ldf_probabilities.predict

# Test 111: Train a dependency parser with neural network and one_learner approach (lols)
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz -b 20 --search_task dep_parser --search 25 --search_alpha 1e-5 --search_rollin mix_per_roll --search_rollout oracle --one_learner --nn 5 --ftrl --search_history_length 3 --root_label 8
    train-sets/ref/search_dep_parser_one_learner.stderr

# Test 112: Train a dependency parser with cost_to_go
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz -b 20 --passes 6 --search_task dep_parser --search 25 --search_alpha 1e-5 --search_rollin mix_per_roll --search_rollout none --holdout_off --search_history_length 3 --root_label 8 --cost_to_go
    train-sets/ref/search_dep_parser_cost_to_go.stderr

# Test 113: Predictions with confidences
{VW} --confidence -d ./train-sets/rcv1_micro.dat --initial_t 0.1 -p confidence.preds
    train-sets/ref/confidence.stderr
    pred-sets/ref/confidence.preds

# Test 114: Over size example test
{VW} -d train-sets/x.txt
    train-sets/ref/oversize.stderr

# Test 115: Long Line test
{VW} -d train-sets/long_line -c -k
    train-sets/ref/long_line.stderr

# Test 116: MWT test
{VW} -d train-sets/cb_eval --multiworld_test f -p cb_eval.preds
    train-sets/ref/cb_eval.stderr
    pred-sets/ref/cb_eval.preds

# Test 117: Audit regressor of ftrl model (from test #107)
{VW} -d train-sets/0001.dat -i models/0001_ftrl.model  --audit_regressor ftrl.audit_regr
    train-sets/ref/ftrl_audit_regr.stderr
    train-sets/ref/ftrl.audit_regr

# Test 118: Audit regressor of csoaa model (from test #95)
{VW} -d train-sets/test_named_csoaa -i models/test_named_csoaa.model --audit_regressor csoaa.audit_regr
    train-sets/ref/csoaa_audit_regr.stderr
    train-sets/ref/csoaa.audit_regr

# Test 119: MWT learn test
{VW} -d train-sets/cb_eval --multiworld_test f --learn 2 -p mwt_learn.preds
    train-sets/ref/mwt_learn.stderr
    pred-sets/ref/mwt_learn.preds

# Test 120: MWT learn exclude test
{VW} -d train-sets/cb_eval --multiworld_test f --learn 2 --exclude_eval -p mwt_learn_exclude.preds
    train-sets/ref/mwt_learn_exclude.stderr
    pred-sets/ref/mwt_learn_exclude.preds

# Test 121: cb_explore
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb_explore 2 --ngram 2 --skips 4 -b 24 -l 0.25 -p rcv1_raw_cb_explore.preds
    train-sets/ref/rcv1_raw_cb_explore.stderr
    pred-sets/ref/rcv1_raw_cb_explore.preds

# Test 122: Predictions with confidences after training
{VW} --confidence --confidence_after_training --initial_t 0.1 -d ./train-sets/rcv1_small.dat -p confidence_after_training.preds
    train-sets/ref/confidence_after_training.stderr
    pred-sets/ref/confidence_after_training.preds

# Test 123: cb_eval save/load #1
{VW} -d train-sets/cb_eval1 --multiworld_test f -f mwt.model -p cb_eval1.preds
    train-sets/ref/cb_eval1.stderr
    pred-sets/ref/cb_eval1.preds

# Test 124: cb_eval save/load #2
{VW} -d train-sets/cb_eval2 -i mwt.model -p cb_eval2.preds
    train-sets/ref/cb_eval2.stderr
    pred-sets/ref/cb_eval2.preds

# Test 125: arc-eager transition-based dependency parser
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz -b 20 --search_task dep_parser --search 26 --search_alpha 1e-5 --search_rollin mix_per_roll --search_rollout oracle --one_learner --search_history_length 3 --root_label 8 --transition_system 2 --passes 8
    train-sets/ref/search_dep_parser_arceager.stderr

# Test 126: recall tree hello world
{VW} --quiet -d train-sets/gauss1k.dat.gz -f models/recall_tree_g100.model --recall_tree 100 -b 20 --loss_function logistic

# Test 127: recall_tree hello world predict-from-saved-model
{VW} -t -d train-sets/gauss1k.dat.gz -i models/recall_tree_g100.model
    train-sets/ref/recall_tree_gauss1k.stderr
    train-sets/ref/recall_tree_gauss1k.stdout

# Test 128: cb_explore_adf with epsilon-greedy exploration
{VW} --cb_explore_adf --epsilon 0.1 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_epsilon.predict
    train-sets/ref/cbe_adf_epsilon.stderr
    pred-sets/ref/cbe_adf_epsilon.predict

# Test 129: cb_explore_adf with softmax exploration
{VW} --cb_explore_adf --softmax --lambda 1 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_softmax.predict
    train-sets/ref/cbe_adf_softmax.stderr
    pred-sets/ref/cbe_adf_softmax.predict

# Test 130: cb_explore_adf with bagging exploration
{VW} --cb_explore_adf --bag 3 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_bag.predict
    train-sets/ref/cbe_adf_bag.stderr
    pred-sets/ref/cbe_adf_bag.predict

# Test 131: cb_explore_adf with explore-first exploration
{VW} --cb_explore_adf --first 2 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_first.predict
    train-sets/ref/cbe_adf_first.stderr
    pred-sets/ref/cbe_adf_first.predict

# Test 132: train a poisson model
{VW} --quiet -d train-sets/poisson.dat -f models/poisson.model --loss_function poisson --link poisson -b 2 -p poisson.train.predict
    train-sets/ref/poisson.train.stderr
    pred-sets/ref/poisson.train.predict

# Test 133: train a poisson model without invariant updates
{VW} --quiet -d train-sets/poisson.dat -f models/poisson.normalized.model --normalized --loss_function poisson --link poisson -b 2 -l 0.1 -p poisson.train.normalized.predict
    train-sets/ref/poisson.train.normalized.stderr
    pred-sets/ref/poisson.train.normalized.predict

# Test 134: second order online learning
{VW} --OjaNewton -d train-sets/0001.dat -f models/second_order.model -p second_order.predict
    train-sets/ref/second_order.stderr
    pred-sets/ref/second_order.predict

# Test 135: cb explore adf
{VW} -d train-sets/cb_adf_crash_1.data -f models/cb_adf_crash.model --cb_explore_adf --epsilon 0.05
    train-sets/ref/cb_adf_crash1.stderr

# Test 136: cb explore adf predict
{VW} -d train-sets/cb_adf_crash_2.data -i models/cb_adf_crash.model -t
    train-sets/ref/cb_adf_crash2.stderr

# Test 137: Fix for regression introduced by badeedb.
# Ensure audit output continues to work correctly in the presence of anon features.
# Github issue 1038 (https://github.com/JohnLangford/vowpal_wabbit/issues/1038)
{VW} --audit -d train-sets/audit.dat --noconstant
    train-sets/ref/audit.stderr
    train-sets/ref/audit.stdout

# Test 138: cb_explore_adf with cover exploration
{VW} --cb_explore_adf --cover 3 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_cover.predict
    train-sets/ref/cbe_adf_cover.stderr
    pred-sets/ref/cbe_adf_cover.predict

# Test 139: cb_explore_adf with cover exploration + doubly robust
{VW} --cb_explore_adf --cover 3 --cb_type dr -d train-sets/cb_test.ldf --noconstant -p cbe_adf_cover_dr.predict
    train-sets/ref/cbe_adf_cover_dr.stderr
    pred-sets/ref/cbe_adf_cover_dr.predict

# Test 140: marginal features
{VW} --marginal f  -d train-sets/marginal_features --noconstant --initial_numerator 0.5 --initial_denominator 1.0 --decay 0.001 --holdout_off -c -k --passes 100 -f marginal_model
    train-sets/ref/marginal.stderr

# Test 141: marginal features test
{VW} -i marginal_model  -d train-sets/marginal_features --noconstant -t
    train-sets/ref/marginal_test.stderr

# Test 142: Evaluate exploration on contextual bandit data
{VW} --explore_eval --epsilon 0.2 -d train-sets/cb_test.ldf --noconstant -p explore_eval.predict
    train-sets/ref/explore_eval.stderr
    pred-sets/ref/explore_eval.predict

# Test 143: Test 1 using JSON
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.json --json \
    -c --passes 8 --invariant \
    --ngram 3 --skips 1 --holdout_off
        train-sets/ref/0001.json.stderr

# Test 144: cb_explore_adf with cover exploration + doubly robust (JSON input)
{VW} --cb_explore_adf --cover 3 --cb_type dr -d train-sets/cb_test.json --json --noconstant -p cbe_adf_cover_dr.predict
    train-sets/ref/cbe_adf_cover_dr.json.stderr
    pred-sets/ref/cbe_adf_cover_dr.predict

# Test 145: mix labeled and unlabeled examples with --bootstrap bug:
# https://github.com/JohnLangford/vowpal_wabbit/issues/1111
{VW} --bootstrap 2 -d train-sets/labeled-unlabeled-mix.dat
    train-sets/ref/labeled-unlabeled-mix.stderr

# Test 146: cb_explore_adf with cover exploration + doubly robust (using more than 256 examples)
{VW} --cb_explore_adf --cover 3 --cb_type dr -d train-sets/cb_test256.json --json --noconstant -p cbe_adf_cover_dr256.predict
    train-sets/ref/cbe_adf_cover_dr256.json.stderr
    pred-sets/ref/cbe_adf_cover_dr256.predict

# Test 147: --scores --oaa
{VW} -d train-sets/probabilities.dat --scores --oaa=4 -p oaa_scores.predict
   train-sets/ref/oaa_scores.stderr
   pred-sets/ref/oaa_scores.predict

# Test 148:  check cb_adf with direct method option
{VW} --cb_adf -d train-sets/cb_test.ldf -p cb_adf_dm.predict --cb_type dm
    train-sets/ref/cb_adf_dm.stderr
    pred-sets/ref/cb_adf_dm.predict

# Test 149: initial_weight option is used
echo "1 | feature:1" | {VW} -a --initial_weight 0.1 --initial_t 0.3
    train-sets/ref/initial_weight.stderr
    train-sets/ref/initial_weight.stdout

# Test 150:  Test --sparse_weights with 148
{VW} --cb_adf -d train-sets/cb_test.ldf -p cb_adf_dm.predict --cb_type dm --sparse_weights
    train-sets/ref/sparse.stderr

# Test 151: lrqfa
{VW} --lrqfa aa3 -d train-sets/0080.dat
    train-sets/ref/0151.stderr

# Test 152: daemon in the foreground test
./daemon-test.sh --foreground
    test-sets/ref/vw-daemon.stdout

# Test 153: marginal features
{VW} --marginal f  -d train-sets/marginal_features --noconstant --initial_numerator 0.5 --initial_denominator 1.0 --decay 0.001 --holdout_off -c -k --passes 100  --compete
    train-sets/ref/marginal_compete.stderr

# Test 154: ignore linear
{VW} -k --cache_file ignore_linear.cache --passes 10000 --holdout_off -d train-sets/0154.dat --noconstant --ignore_linear x -q xx
    train-sets/ref/ignore_linear.stderr

# Test 155: checking audit_regressor with --save_resume model
{VW} -d train-sets/0001.dat -i models/0097.model --save_resume --audit_regressor 0097.audit_regr
    train-sets/ref/0097.audit_regr.stderr
    train-sets/ref/0097.audit_regr

# Test 156: --cubic regression verification
./cubic-test.sh ${VW}

# Test 157: save_resume without --preserve_performance_counters does not alter performance counters over multiple passes
{VW} -d train-sets/0001.dat -f models/sr.model  --passes 2 -c -k  -P 50 --save_resume
    train-sets/ref/157.stderr

# Test 158: test decision service json parsing
{VW} -d train-sets/decisionservice.json --dsjson --cb_explore_adf --epsilon 0.2 --quadratic GT
    train-sets/ref/decisionservice.stderr

# Test 159: test --bootstrap & --binary interaction
{VW} -d train-sets/rcv1_mini.dat --bootstrap 5 --binary -c -k --passes 2
    train-sets/ref/bootstrap_and_binary.stderr

# Test 160: test --bootstrap & --oaa interaction
# (Also adds -q :: and -P1 to get & verify perfect predictions in 2nd pass)
{VW} -d train-sets/multiclass --bootstrap 4 --oaa 10 -q :: --leave_duplicate_interactions  -c -k --passes 2 --holdout_off -P1
    train-sets/ref/bootstrap_and_oaa.stderr

# Test 161: --classweight
{VW} -d train-sets/0001.dat --classweight 1:2,0:3.1,-1:5
    train-sets/ref/classweight.stderr

# Test 162: --classweight with multiclass
{VW} --oaa 10 -d train-sets/multiclass --classweight 4:0,7:0.1,2:10 --classweight 10:3
    train-sets/ref/classweight_multiclass.stderr

# Test 163: --classweight with recall_tree
{VW} --recall_tree 10 -d train-sets/multiclass --classweight 4:0,7:0.1 --classweight 2:10,10:3
    train-sets/ref/classweight_recall_tree.stderr

# Test 164: cs_active low mellowness
{VW} --cs_active 3 -d ../test/train-sets/cs_test --cost_max 2 --mellowness 0.01 --simulation --adax
    train-sets/ref/cs_active_0.01.stderr

# Test 165: cs_active high mellowness
{VW} --cs_active 3 -d ../test/train-sets/cs_test --cost_max 2 --mellowness 1.0 --simulation --adax
    train-sets/ref/cs_active_1.0.stderr

# Test 166: hash_seed train
{VW} --hash_seed 5 -d train-sets/rcv1_mini.dat --holdout_off --passes 2 -f hash_seed5.model -c -k --ngram 2 -q ::
    train-sets/ref/hash_seed_train.stderr

# Test 167: hash_seed test
{VW} -d train-sets/rcv1_mini.dat -i hash_seed5.model -t
    train-sets/ref/hash_seed_test.stderr

# Test 168: test cb with dm
{VW} -d train-sets/rcv1_raw_cb_small.vw -t -i cb_dm.reg
    train-sets/ref/rcv1_raw_cb_dm_test.stderr

# Test 169: test cbify large
{VW} -d train-sets/rcv1_multiclass.dat --cbify 2 --epsilon 0.05
    train-sets/ref/rcv1_multiclass.stderr

# Test 170 cbify adf, epsilon-greedy
{VW} --cbify 10 --cb_explore_adf --epsilon 0.05 -d train-sets/multiclass
    train-sets/ref/cbify_epsilon_adf.stderr

# Test 171 cbify cs, epsilon-greedy
{VW} --cbify 3 --cbify_cs --epsilon 0.05 -d train-sets/cs_cb
    train-sets/ref/cbify_epsilon_cs.stderr

# Test 172 cbify adf cs, epsilon-greedy
{VW} --cbify 3 --cbify_cs --cb_explore_adf --epsilon 0.05 -d train-sets/cs_cb
    train-sets/ref/cbify_epsilon_cs_adf.stderr

# Test 173 cbify adf, regcb
{VW} --cbify 10 --cb_explore_adf --cb_type mtr --regcb --mellowness 0.01 -d train-sets/multiclass
    train-sets/ref/cbify_regcb.stderr

# Test 174 cbify adf, regcbopt
{VW} --cbify 10 --cb_explore_adf --cb_type mtr --regcbopt --mellowness 0.01 -d train-sets/multiclass
    train-sets/ref/cbify_regcbopt.stderr
