#!/usr/bin/perl

# usage:
# git clone --bare git@git.zulip.net:eng/zulip.git
# cd zulip.git
# git fast-export --export-marks=../zulip.em --progress=1000 --all > ../zulip.fe
# git init --bare ../zulip-zanitized.git
# cd ../zulip-zanitized.git
# zanitizer ../zulip.fe ../zulip.em | git fast-import --quiet

use strict;
use warnings;

use Digest::SHA qw(sha1_hex);
use FindBin;

use lib $FindBin::Bin;
use zanitizer_config;

# Report whether two trees (hash references mapping a path to its entry
# string) hold exactly the same paths with exactly the same entries.
# Returns a true value when they match and a false value otherwise.
sub eq_tree {
    my ( $left, $right ) = @_;

    # Trees of different sizes cannot have identical key sets.
    return !1 if keys(%$left) != keys(%$right);

    # With equal sizes it is enough to check one direction: every path on
    # the left must be present on the right with the same entry.
    for my $path ( keys %$left ) {
        return !1
          if !exists $right->{$path} || $left->{$path} ne $right->{$path};
    }
    return !0;
}

# Command-line arguments: the fast-export stream to process (required) and
# the marks file written by `git fast-export --export-marks` (optional).
my ( $fast_export_file, $export_marks_file ) = @ARGV;
defined $fast_export_file
  or die "usage: $0 FAST_EXPORT_FILE [EXPORT_MARKS_FILE]\n";

# Maps the first whitespace-separated field of each marks-file line (the
# mark) to the second field.  Stays empty when no marks file was given.
my %export_marks = ();
if ( defined $export_marks_file ) {
    open my $marks_fh, '<', $export_marks_file
      or die "cannot open $export_marks_file: $!";
    while ( my $line = <$marks_fh> ) {
        my @fields = split ' ', $line;

        # A blank line contributes nothing; any other field count would
        # silently shift every later key/value pair, so refuse it.
        next if !@fields;
        @fields == 2
          or die "malformed line $. in $export_marks_file: $line";
        $export_marks{ $fields[0] } = $fields[1];
    }
    close $marks_fh;
}

# Bookkeeping tables filled in while the stream is processed.
my (
    %mark_map,         # input mark => mark that stands for it in the output
    %blob_mark,        # SHA-1 of scrubbed blob data => first mark with that data
    %ref_commit,       # ref name => output commit mark last recorded for it
    %commit_tree,      # emitted commit mark => { path => "mode blob-mark" }
    %scrubbed_blob,    # blob marks whose data was changed by scrubbing
    %scrubbed_file,    # output paths that were given a scrubbed blob
    %deleted_file,     # original paths dropped by filename scrubbing
    %renamed_file,     # original path => scrubbed path, when they differ
);

# Main pass: read the fast-export stream one command at a time and print a
# rewritten stream on STDOUT.
#
# $_ always holds the current command line.  Each branch either leaves its
# own last line in $_ (the read at the bottom of the loop then fetches the
# next command) or, for `reset`, has already read ahead and uses `next`.
#
# scrub_text and scrub_filename are not defined in this file (presumably
# they come from zanitizer_config).  They are called with their input in $_
# and the result is read back from $_; an undefined $_ after scrub_filename
# is treated as "drop this path".
open my $in, '<', $fast_export_file
  or die "cannot open $fast_export_file: $!";
$_ = <$in>;
while ( defined $_ ) {
    if ( $_ eq "blob\n" ) {
        my ($mark) = <$in> =~ /^mark (\S*)\n$/s or die;
        my ($len)  = <$in> =~ /^data (\d+)\n$/s or die;
        read( $in, my $data, $len ) == $len or die;

        # Scrub the contents and remember which blobs actually changed.
        $_ = $data;
        scrub_text;
        if ( $_ ne $data ) {
            $scrubbed_blob{$mark} = 1;
            $data = $_;
        }
        <$in> eq "\n" or die;

        # Blobs that are identical after scrubbing are emitted only once;
        # later duplicates are mapped onto the first mark.
        my $hash = sha1_hex($data);
        if ( exists $blob_mark{$hash} ) {
            $mark_map{$mark} = $blob_mark{$hash};
        }
        else {
            $blob_mark{$hash} = $mark_map{$mark} = $mark;
            print "blob\nmark $mark\ndata ", length($data), "\n", $data, "\n";
        }
    }
    elsif (/^reset (?'ref'.*)\n$/s) {
        my $ref  = $+{ref};
        my $from = undef;
        $_ = <$in>;
        while ( defined $_ ) {
            if ( $_ eq "\n" ) {
                $_ = <$in>;
                last;
            }
            elsif (/^from (?'from'.*)\n$/s) {
                $from = $+{from};
            }
            else {
                # The trailing LF on reset is optional
                last;
            }
            $_ = <$in>;
        }

        # Record where the ref now points: the rewritten commit named by
        # `from`, or nothing when there is no `from` or that commit has no
        # counterpart in the output.
        my $target = defined $from ? $mark_map{$from} : undef;
        $ref_commit{$ref} = $target;
        print "reset $ref\n";
        print "from $target\n" if defined $target;
        print "\n";

        # $_ already holds the next command, so skip the read at the bottom.
        next;
    }
    elsif (/^commit (?'ref'.*)\n$/s) {
        my $ref         = $+{ref};
        my ($mark)      = <$in> =~ /^mark (\S*)\n$/s     or die;
        my ($author)    = <$in> =~ /^author (.*)\n$/s    or die;
        my ($committer) = <$in> =~ /^committer (.*)\n$/s or die;
        my ($len)       = <$in> =~ /^data (\d+)\n$/s     or die;
        read( $in, my $data, $len ) == $len or die;
        $_ = <$in>;
        my $from = undef;

        if (/^from (?'from'.*)\n$/s) {
            $from = $+{from};
            $_    = <$in>;
        }

        # Without an explicit `from`, the commit builds on whatever commit
        # was last recorded for this ref.
        my $base  = defined $from ? $mark_map{$from} : $ref_commit{$ref};
        my @merge = ();
        while (/^merge (?'mark'\S*)\n$/s) {
            die "unimplemented case" if !defined $from;
            push @merge, $+{mark};
            $_ = <$in>;
        }

        # git fast-export incorrectly writes M before D when replacing
        # a symlink with a directory.  We move every D before every M
        # to work around this bug.
        my @delete = ();
        my @modify = ();
        while (1) {
            if ( $_ eq "\n" ) {
                last;
            }
            elsif (/^D (?'file'.*)\n$/s) {
                $_ = $+{file};
                scrub_filename;
                push @delete, { file => $_ } if defined $_;
            }
            elsif (/^M (?'mode'\d+) (?'mark'\S+) (?'file'.*)\n$/s) {

                # Copy the captures before calling out, so nothing the
                # scrubber does can affect them.
                my ( $mode, $blob, $path ) =
                  ( $+{mode}, $+{mark}, $+{file} );
                $_ = $path;
                scrub_filename;
                if ( defined $_ ) {
                    $renamed_file{$path} = $_ if $_ ne $path;
                    $scrubbed_file{$_} = 1 if exists $scrubbed_blob{$blob};
                    push @modify,
                      { mode => $mode, mark => $blob, file => $_ };
                }
                else {
                    $deleted_file{$path} = 1;
                }
            }
            else {
                die "unhandled command in commit: $_";
            }
            $_ = <$in>;
        }

        # Compute the resulting tree: start from the base commit's tree,
        # apply deletions first, then modifications.
        my $base_tree = defined $base ? $commit_tree{$base} : {};
        my %tree      = %$base_tree;
        for my $entry (@delete) {
            delete $tree{ $entry->{file} };
        }
        for my $entry (@modify) {
            $tree{ $entry->{file} } =
              "$entry->{mode} $mark_map{ $entry->{mark} }";
        }

        if ( eq_tree( \%tree, $base_tree )
            && !( grep { defined $mark_map{$_} } @merge ) )
        {
            # Nothing changed relative to the base and no merge parent
            # survives: drop the commit and point its mark at the base.
            $ref_commit{$ref} = $mark_map{$mark} = $base;
        }
        else {
            $ref_commit{$ref}   = $mark_map{$mark} = $mark;
            $commit_tree{$mark} = \%tree;

            # Scrub the message and, when the marks file has an entry for
            # this mark, append a pointer to it after a blank line.
            $_ = $data;
            scrub_text;
            if ( exists $export_marks{$mark} ) {
                $_ .= "\n" until /\n\n$/;
                $_ .= "(imported from commit $export_marks{$mark})\n";
            }
            print
"commit $ref\nmark $mark\nauthor $author\ncommitter $committer\ndata ",
              length($_), "\n", $_;
            if ( defined $from ) {
                die "unimplemented case" if !defined $mark_map{$from};
                print "from $mark_map{$from}\n";
            }
            for my $parent (@merge) {
                print "merge $mark_map{$parent}\n"
                  if defined $mark_map{$parent};
            }
            for my $entry (@delete) {
                print "D $entry->{file}\n";
            }
            for my $entry (@modify) {
                print
                  "M $entry->{mode} $mark_map{ $entry->{mark} } $entry->{file}\n";
            }
            print "\n";
        }
    }
    elsif (/^progress /) {
        print $_;
    }
    else {
        die "unhandled command: $_";
    }
    $_ = <$in>;
}
close $in;

# Summarize on STDERR what the run changed, each list sorted by path.
print STDERR "Deleted files:\n";
for my $path ( sort keys %deleted_file ) {
    print STDERR "  $path\n";
}

print STDERR "Renamed files:\n";
for my $path ( sort keys %renamed_file ) {
    print STDERR "  $path => $renamed_file{$path}\n";
}

print STDERR "Scrubbed files:\n";
for my $path ( sort keys %scrubbed_file ) {
    print STDERR "  $path\n";
}
