#!/bin/bash
# vim: set et ts=8 sts=4 sw=4 ai si:
#
# collect_product_metadata -- Takes a path to a product and collects the
# metadata needed to create a "solv" file. Also, it can create a database file
# for SPident, as SPprep would create it.
#
# Copyright (c) 2008 SuSE Linux Products GmbH, Nuernberg, Germany
#
# Author: Olaf Dabrunz <od@suse.de>
#

# Name this script was invoked as, with any leading directory part removed.
PRG=$0
PRG=${PRG##*/}

# Run of blanks exactly as wide as $PRG; used to align the continuation line
# of the usage synopsis under the program name.
printf -v SPC '%*s' "${#PRG}" ''

# Location of the SPprep helper that is shipped with SPident.
SPPREP=/usr/share/misc/SPident/SPprep

# usage
#
# Writes the version banner, the synopsis and the option reference to stderr.
# Only the banner and the two synopsis lines depend on variables ($PRG and
# $SPC); the remaining text is constant and therefore comes from a quoted
# here-document.
usage () {
    {
        printf '%s version 0.7.0\n\nusage:\n' "$PRG"
        printf '%s [-h|--help] [-s] [-f] [-c] [-k|-r] [-t<tags_regex>] [-p<packs_regex>]\n' "$PRG"
        printf '%s <product_name> <product_directory> <target_directory>\n' "$SPC"
        cat <<'EOF'

Takes a path to a product and creates a tar archive with the metadata needed to
create a "solv" file.

The <product_name> is used as the name of the tar archive file. It is also used
as the product name and file name for the SPident database file.

<product_directory> is the name of the directory that contains the product
metadata. It may be of the form <product-dir>/DVD1 or just <product-dir>.

All files and directories in <target_directory> are removed. Then the files for
the archive will be copied to <target_directory>, the information in them is
filtered and the files are collected into the archive in
<target_directory>/<product_name>.tar.gz. The copied files are then removed.

-s:  if SPident is installed, also create a database file for SPident (using
     SPprep that comes with SPident)
-f:  do not filter product metadata
-c:  do not remove comments from filtered file
     (NOTE: unknown tags are treated as comments)
-k:  use "keep" filter for tags: the tags filter uses a list of tags to keep,
     and removes all other tags (this is the default)
-r:  use "remove" filter for tags: the tags filter uses a list of tags to
     remove, and keeps all tags not in this list
-t:  use specified perl <tags_regex> for tags to keep or remove, instead of
     built-in defaults
-p:  use specified perl <packs_regex> for packs to remove, instead of
     built-in defaults
-h:  show this message
EOF
    } >&2
}

# Print the usage text and leave with exit status 0.
usage_exit () {
    usage
    exit 0
}

# usage_error <message> <status>
# Report <message> on stderr, print the usage text, exit with <status>.
usage_error () {
    echo -e "$PRG: error: $1" >&2
    usage
    exit $2
}

# die_perror <message> <errno>
# Like die, but appends whatever the external "perror" command prints for
# <errno> to the message.
die_perror () {
    echo -e "$PRG: error: $1 ($(perror $2))" >&2
    exit $2
}

# die <message> <status>
# Report <message> on stderr and exit with <status>.  When called inside a
# subshell only that subshell is terminated.
die () {
    echo -e "$PRG: error: $1" >&2
    exit $2
}

# echo_exec <command> [<args>...]
# Show the command line on stderr, then run it; returns the command's status.
echo_exec () {
    echo "$@" >&2
    "$@"
}

# dprintf <level> <format> [<args>...]
# printf the message when <level> does not exceed $debug.  A "0" is appended
# to both values before comparing so that an empty $debug still compares as a
# number.  Returns non-zero when nothing was printed.
dprintf () {
    test "$1"0 -le "$debug"0 || return
    shift
    printf "$@"
}

# Let getopt(1) normalize the command line: all options are moved in front of
# a "--" marker and all non-option arguments follow it.  getopt reports
# invalid options itself; in that case we leave with its exit status.
TEMP=$(getopt -a -o hsfckrt:p:d --long help -- "$@") || exit
eval set -- "$TEMP"

# option defaults
debug=
filter_metadata="yes"
filter_variant="keep"
spident="no"
tag_re=""
pack_re=""
include_comments="no"
PRODUCT_NAME=""
PRODUCT_DIR=""
TARGET_DIR=""

# evaluate options (everything up to the "--" marker)
while [ $# -gt 0 ] ; do
    case "$1" in
        -d) echo "using the secret debug option... Arrrr!" >&2; debug=1 ;;
        -f) filter_metadata="no" ;;
        -k) filter_variant="keep" ;;
        -r) filter_variant="remove" ;;
        -t) tag_re="$2"; shift ;;
        -p) pack_re="$2"; shift ;;
        -c) include_comments="yes" ;;
        -s) spident="yes" ;;
        -h|--help) usage_exit ;;
        --) shift ; break ;;
         *) usage_error "unknown option: $1" 1 ;;
    esac
    shift
done

# Whatever follows "--" is positional, even when it starts with a dash (for
# example a directory that is literally named "-f"), so it must not be run
# through the option switch above.
test $# -gt 3 && \
    { usage_error "too many arguments: $4" 1 ; }
PRODUCT_NAME="${1-}"
PRODUCT_DIR="${2-}"
TARGET_DIR="${3-}"

test x"$PRODUCT_NAME" = x && \
    { usage_error "please specify a product name" 1 ; }
test x"$PRODUCT_DIR" = x && \
    { usage_error "please specify a product directory" 1 ; }
test x"$TARGET_DIR" = x && \
    { usage_error "please specify a target directory" 1 ; }

# ---------------------------------------------------------------
# metadata files to copy
#
# Each entry is a shell pattern (glob plus brace expansion), relative to the
# media directory.  The entries are kept quoted here and are only expanded
# later, through "eval" with "shopt -s nullglob" in effect.  The bracket
# around the first character turns every word that results from the brace
# expansion into a glob, so that names which do not exist are dropped by
# nullglob instead of being passed on literally.
# The clean-up step removes only the first path component of each entry.
metadata_files=(
    "[c]ontent{,.*}"
#    "[l]icense{,.*}"
    "[g]pg-pubkey-*"
    "[m]edia.1/{media,products{,.*}}"
    "[s]use/setup/descr/packages{,.gz}"
# this would copy localized strings (descriptions, ...), disk usage information
# and patterns as well:
#    "[s]use/setup/descr/{*.pat{,.*},packages{,.*},patterns{,.*}}"
)

# ---------------------------------------------------------------
# saving space by filtering the packages.gz file

# irrelevant packages to remove (default for the -p option)
# The entries are joined into one alternation regex.  If the "=Pkg:" line of a
# package matches it, the whole package entry is removed.
# A match has to start and end at a word boundary, but it may occur anywhere
# on the "=Pkg:" line, not only in the architecture field.
filter_packs=(
    # src and nosrc packages
    "src"
    "nosrc"
)

# keep at least one of these tags (used in both the "keep" and the "remove"
# filter)
# earlier tags in each list are preferred: only the first tag of a list that
# is present in a package is protected from removal
# example: if a package does not have a "Tim" tag, we need to keep the "Req"
# tag (even if it is missing in the "keep" list or included in the "remove"
# list)
# Note: each string is a perl list reference "[Tim, Req]"; the strings are
# pasted verbatim into the perl filter program, so they must be valid perl
filter_keep_at_least_one_tag=(
    "[Tim, Req]"
)

# filter_keep_tags and filter_remove_tags together are the list of all known
# tags. Tags outside of both lists are reported as unknown by the filter and
# are then handled like comment lines.

# tags to keep for the "keep" filter variant (default tag list for -k)
# tags marked with a * are needed to use the identical() function in
# libsatsolver (as of 01-Dec-2008 -- this may change though)
# NOTE: during testing, keeping Requires doubled the total metadata size
# from 107 KB to 198 KB.
# Entries that are commented out here are active in filter_remove_tags.
filter_keep_tags=(
    # Dependencies
#    "Prq"       # Prerequires
#    "Req"       # Requires (* only used in identical() when Buildtime is missing)
#    "Prv"       # Provides
#    "Con"       # Conflicts
#    "Enh"       # Enhances
#    "Obs"       # Obsoletes
#    "Rec"       # Recommends
#    "Sug"       # Suggests
#    "Sup"       # Supersedes (NOTE(review): presumably "Supplements" -- confirm)

    # Others
#    "Aut"       # Author(s)
#    "Grp"       # Package Group
#    "Src"       # Source Package

#    "Cks"       # Checksum (whole package)
#    "Lic"       # License
#    "Siz"       # Size package-size installed-size
#    "Shr"       # Package built from this source package

    "Kwd"       # Keywords

    "Pkg"       # Package name (epoch?) vers rev arch *
    "Vnd"       # Vendor *
    "Tim"       # Buildtime *
    "Loc"       # Location media-number rpm-name
    "Ver"       # Version number for the packages file
)

# tags to remove for the "remove" filter variant (default tag list for -r)
# Entries that are commented out with "##" here are active in
# filter_keep_tags; both lists together form the list of known tags.
filter_remove_tags=(
    # Dependencies
    "Prq"       # Prerequires
    "Req"       # Requires (* only used in identical() when Buildtime is missing)
    "Prv"       # Provides
    "Con"       # Conflicts
    "Enh"       # Enhances
    "Obs"       # Obsoletes
    "Rec"       # Recommends
    "Sug"       # Suggests
    "Sup"       # Supersedes (NOTE(review): presumably "Supplements" -- confirm)

    # Others
    "Aut"       # Author(s)
    "Grp"       # Package Group
    "Src"       # Source Package

    "Cks"       # Checksum (whole package)
    "Lic"       # License
    "Siz"       # Size package-size installed-size
    "Shr"       # Package built from this source package

##    "Kwd"       # Keywords

##    "Pkg"       # Package name (epoch?) vers rev arch *
##    "Vnd"       # Vendor *
##    "Tim"       # Buildtime *
##    "Loc"       # Location media-number rpm-name
##    "Ver"       # Version number for the packages file
)

#
# ---------------------------------------------------------------
# Functions
#

# regexify_array <string>...
#
# Prints the arguments as one parenthesized regex alternation, for example
# "(a|b|c)".  Blanks are stripped from the result, so the strings must not
# rely on embedded spaces.  Without arguments "()" is printed.
regexify_array () {
    # joining "$*" with "|" gives the alternation in a single step
    local IFS='|'
    local alternation="($*)"

    echo "${alternation// /}"
}

# filter_packages_file
#
# Shrinks $TARGET_DIR/suse/setup/descr/packages.gz in place: package entries
# whose "=Pkg:" line matches the packs regex are dropped, and from the
# remaining entries only the tags selected by the tags regex and the filter
# variant are kept.
#
# Reads the globals TARGET_DIR, tag_re, pack_re, filter_variant,
# include_comments, debug and the filter_* arrays.  The body runs in a
# subshell, so the working directory, the shell options and the defaults
# assigned to tag_re / pack_re do not leak to the caller.  For the same
# reason die() only ends the subshell: the caller has to check the return
# status, which is non-zero when filtering failed.  In that case packages.gz
# is left untouched.
filter_packages_file () {
(
    # Without pipefail only the exit status of gzip would be seen, and a
    # failing zcat or perl would leave us with an empty packages_filtered.gz
    # that then replaces the real packages.gz.
    set -o pipefail

    cd "$TARGET_DIR/suse/setup/descr"       \
        || die "cannot cd to $TARGET_DIR/suse/setup/descr" $?

    all_tags_re=$(regexify_array "${filter_keep_tags[@]}" "${filter_remove_tags[@]}")

    if [ x"$tag_re" = x ] ; then
        if [ "$filter_variant" = "keep" ] ; then
            tag_re=$(regexify_array "${filter_keep_tags[@]}")
        else
            tag_re=$(regexify_array "${filter_remove_tags[@]}")
        fi
    else
        # tags named by the user count as known tags, too
        all_tags_re="($all_tags_re|$tag_re)"
    fi

    # perl list of list references, e.g. "([Tim, Req],)"
    keep_one="(${filter_keep_at_least_one_tag[*]/%/,})"

    if [ x"$pack_re" = x ] ; then
        pack_re=$(regexify_array "${filter_packs[@]}")
    fi

    # The settings are handed to perl through the environment.  This way
    # regular expressions given with -t / -p reach perl unchanged, whatever
    # blanks, quotes, backslashes or sigils they contain.  Only $keep_one,
    # which is built from constants of this script, is pasted into the
    # program text.
    zcat packages.gz |                                                      \
        CPM_DEBUG="${debug:-0}"                                             \
        CPM_ALL_TAGS_RE="$all_tags_re"                                      \
        CPM_TAG_RE="$tag_re"                                                \
        CPM_PACK_RE="$pack_re"                                              \
        CPM_FILTER_VARIANT="$filter_variant"                                \
        CPM_INCLUDE_COMMENTS="$include_comments"                            \
        perl ${debug:+-d} -we  '
        my $debug           = $ENV{CPM_DEBUG};
        my $all_tags_re     = $ENV{CPM_ALL_TAGS_RE};
        my $tag_re          = $ENV{CPM_TAG_RE};
        my @keep_one        =  '"$keep_one"';
        my $pack_re         = $ENV{CPM_PACK_RE};
        my $filter_variant  = $ENV{CPM_FILTER_VARIANT};
        my $keep_filter     = ($filter_variant eq "keep") ? 1 : 0;
        my $include_comments= $ENV{CPM_INCLUDE_COMMENTS};
        my $keep_comments   = ($include_comments eq "yes") ? 1 : 0;

        if ($keep_comments) {
            print STDERR "packages.gz: NOTE: comments are kept\n";
        } else {
            print STDERR "packages.gz: comments are removed\n";
        }

        # the first line has to announce file format version 2.0
        my $line = <>;
        if (not defined $line or $line !~ /^=Ver:\s*2\.0\b/) {
            my $got = "nothing";
            $got = $1       if defined $line and $line =~ /^=Ver:\s*(\S+)/;
            print STDERR "error: wrong packages.gz file version: expected 2.0, got $got\n";
            exit 1;
        }
        print $line;

        # packinfo
        my %pi = (ord => []);
        # =Pkg ...                      key for each tag ("=$tag") or other item ("#$icnt")
        #      -> ("=Pkg: ...", ...)    containing a list of all lines for this tag
        # ord  -> (Pkg, Vnd, ...)       order of tags and other items for this pack in the file
        # keep -> (Tim, ...)            key for each tag that we may not remove (see $keep_one)

        my $removed_packs = 0;
        my $kept_packs = 0;
        my $removed = 0;
        my $kept = 0;

        sub item_print ($) {
            my ($item) = @_;

            # If the item is a tag, it collects all the lines for multiple
            # occurrences of the tag. The "ord" key lists all these occurrences,
            # but we want to print the collected lines of this tag only once.
            # Other items contain only a single line. It does not harm to treat
            # them in the same way as tags.
            return          if exists $pi{printed}->{$item};
            $pi{printed}->{$item} = 1;

            # for non-tag items, we may need to remove comments
            if ($keep_comments or $item =~ /^=/) {
                $kept++, print          foreach (@{$pi{$item}});
            } else {
                foreach (@{$pi{$item}}) {
                    $removed++, next    if /^#/;
                    $kept++;
                    print;
                }
            }
        }

        sub filter_and_print() {
            # if no package has been found, print collected other items
            if (not exists $pi{"=Pkg"}) {
                foreach my $item (@{$pi{ord}}) {
                    item_print($item);
                }
                return;
            }

            # skip matching package
            if ($pi{"=Pkg"}->[0] =~ /^=Pkg:.*\b$pack_re\b/) {
                foreach my $item (@{$pi{ord}}) {
                    $removed++          foreach (@{$pi{$item}});
                }
                $removed_packs++;
                return;
            }
            $kept_packs++;

            # flag tags that we may not remove
            foreach my $keep_one_list (@keep_one) {
                my $found = 0;
                foreach my $tag (@$keep_one_list) {
                    $pi{keep}->{"=$tag"} = 1, $found = 1, last  if exists $pi{"=$tag"};
                }
                if (not $found) {
                    printf(STDERR "WARNING: need at least one tag of (%s), but found none for package %s\n",
                            join(", ", @$keep_one_list), ($pi{"=Pkg"}->[0] =~ /=Pkg:\s+(.+)$/)[0]);
                }
            }

            # now filter and print the items
            foreach my $item (@{$pi{ord}}) {
                next    if not exists $pi{$item};

                if (not exists $pi{keep}->{$item} and   # if this is not an item we must keep and
                    $item =~ /^=/) {                    #     if it is a tag item, filter it

                    unless ($item =~ /^=$tag_re$/       # filter: skip this tag unless
                                ? $keep_filter          #   the tag matches and we use the "keep" filter or
                                : not $keep_filter) {   #   the tag does not match and we use the "remove" filter
                        $removed++      foreach (@{$pi{$item}});
                        next;
                    }
                }
                item_print($item);
            }
        }

        my $tag     = "";
        my $icnt    = 0;

        while(<>) {
            print STDERR "\t\t::$_"     if $debug;

            if (/^=Pkg:/) {
                # new package found
                # filter and print old package info
                filter_and_print();

                $tag    = "";
                $icnt   = 0;
                %pi     = (ord => []);
            }

                                      push(@{$pi{"=$1"}}, $_), $tag = "", next  if /^-($tag):/;         # save end of multi-line tag
                                      push(@{$pi{"=$tag"}}, $_),          next  if $tag;                # save contents of multi-line tag
            push(@{$pi{ord}}, "=$1"), push(@{$pi{"=$1"}}, $_), $tag = $1, next  if /^\+($all_tags_re):/;# save start of multi-line tag

            push(@{$pi{ord}}, "=$1"), push(@{$pi{"=$1"}}, $_),            next  if /^=($all_tags_re):/; # save single-line tag

            print(STDERR "WARNING: found unknown tag in line: $_")              if /^[+=](\w+):/;       # report unknown tag

            # save other item
            push(@{$pi{ord}}, "#$icnt");
            push(@{$pi{"#$icnt"}}, $_);
            $icnt++;
            next;
        }
        # print last package...
        filter_and_print();

        printf(STDERR "packages.gz: packages removed:    %7d (packs: %s)\n", $removed_packs, $pack_re);
        printf(STDERR "packages.gz: packages kept:       %7d\n", $kept_packs);
        printf(STDERR "packages.gz: total lines removed: %7d%s\n", $removed, $keep_filter ? "" : " (tags: $tag_re)");
        printf(STDERR "packages.gz: total lines kept:    %7d%s\n", $kept,    $keep_filter ? " (tags: $tag_re)" : "");
    ' | gzip -f -9 - > packages_filtered.gz                                 \
                                || {
                                    rc=$?
                                    # do not leave the partial result behind
                                    rm -f packages_filtered.gz
                                    die "cannot filter packages.gz" $rc
                                }

    printf "packages.gz:          size:      %8d bytes\n" "$(stat --printf="%s" packages.gz)" >&2
    printf "packages.gz: filtered size:      %8d bytes\n" "$(stat --printf="%s" packages_filtered.gz)" >&2

    # overwrite the contents instead of renaming, so that packages.gz keeps
    # its inode and attributes
    cat packages_filtered.gz > packages.gz                                  \
                                || die "error overwriting packages.gz" $?
    rm -f packages_filtered.gz
)
}

#
# ---------------------------------------------------------------
# Main program
#

# Start with an empty target directory.  The path may still be relative at
# this point, so "--" keeps a name with a leading dash from being taken for
# an option.  Left-over files would end up in the archive, hence the check.
rm -rf -- "$TARGET_DIR"         || die "cannot remove $TARGET_DIR" $?

mkdir -p -- "$TARGET_DIR"       || die "cannot create directory $TARGET_DIR" $?

# make the target path absolute: it is used after changing directories
if [ x"${TARGET_DIR#/}" = x"$TARGET_DIR" ] ; then
    TARGET_DIR="$PWD/$TARGET_DIR"
fi

# reduce <product-dir>/, <product-dir>/CD1 and <product-dir>/DVD1 to the
# plain <product-dir>
for subdir in "" CD1 DVD1 ; do
    PRODUCT_DIR="${PRODUCT_DIR%/$subdir}"
done

# locate the media directory, i.e. the one that contains "suse"; on success
# $subdir keeps naming it for the copy step
found=0
for subdir in "" CD1 DVD1 ; do
    test -d "$PRODUCT_DIR/$subdir/suse" && { found=1; break; }
done

test $found -eq 1               || die "cannot find <media>/suse directory" 1


# SPprep
# Only with -s: let SPprep write the SPident database file into the target
# directory. A missing SPprep is merely reported, a failing one is fatal.
if [ "$spident" != "no" ] ; then
    if [ -x "$SPPREP" ] ; then
        echo "running SPprep..." >&2
        echo_exec "$SPPREP"                                                 \
                        --dir="$PRODUCT_DIR"                                \
                        --product="$PRODUCT_NAME"                           \
                        --db-dir="$TARGET_DIR"                              \
                                || die "SPprep failed" $?
    else
        echo "WARNING: SPprep not found, skipping!" >&2
    fi
fi

# copying metadata
# The work is done in a subshell so that the directory change, nullglob and
# pipefail stay local.  die() can only end that subshell, therefore its exit
# status is checked behind the closing parenthesis.
(
    echo "copying files..." >&2

    # a failure of the packing tar must not be hidden by the unpacking one
    set -o pipefail

    cd -- "$PRODUCT_DIR/$subdir" || die "cannot cd to $PRODUCT_DIR/$subdir" $?

    # eval expands the braces and globs stored in metadata_files; patterns
    # without a match are dropped because of nullglob
    shopt -s nullglob
    eval tar cpf - "${metadata_files[@]}" |                                 \
                                ( cd "$TARGET_DIR" && tar xvpf - >&2 )      \
                                || die "error copying files" $?

    echo >&2
) || exit $?

# make sure packages is zipped
# Runs in a subshell: a failing gzip is reported through die(), but that only
# ends the subshell and the script carries on.
(
    plain_packages="$TARGET_DIR/suse/setup/descr/packages"

    # nothing to do when the product ships no uncompressed packages file
    [ -e "$plain_packages" ]    || exit 0

    echo "zipping packages file..." >&2

    echo_exec gzip -9 "$plain_packages"                                     \
                                || die "cannot zip packages file" $?

    echo >&2
)

# filtering metadata
if [ "$filter_metadata" = "yes" ] ; then
    echo "filtering metadata..." >&2

    # filter_packages_file does its work in a subshell, so a die() in there
    # cannot stop this script; stop here instead of archiving a broken file
    filter_packages_file        || exit $?
else
    echo "NOT filtering metadata..." >&2

    # quoted, so that a target directory with blanks in its name works
    printf "packages.gz:          size:      %8d bytes\n"                   \
        "$(stat --printf="%s" "$TARGET_DIR/suse/setup/descr/packages.gz")" >&2
fi
echo >&2

# tar up
tarfile="$TARGET_DIR/$PRODUCT_NAME".tar.gz
# Glob that matches just the file called $PRODUCT_NAME.  It is no longer used
# for building the archive, but stays defined for code that still refers to
# it.
PRODUCT_NAME_GLOB="[${PRODUCT_NAME:0:1}]${PRODUCT_NAME:1}"

# Subshell keeps the directory change and nullglob local; its exit status is
# checked because die() can only end the subshell.
(
    echo "creating metadata tar file..." >&2

    cd "$TARGET_DIR"            || die "cannot cd to $TARGET_DIR" $?

    # Only the constant patterns of metadata_files go through eval.  The
    # archive name and the product name come from the command line and are
    # passed to tar as they are, so blanks or shell metacharacters in them
    # are not parsed a second time.
    shopt -s nullglob
    eval "archive_members=( ${metadata_files[*]} )"

    # the SPident database file is there only if SPprep has been run
    if [ -e "$PRODUCT_NAME" ] ; then
        archive_members=( "${archive_members[@]}" "$PRODUCT_NAME" )
    fi

    tar czpvf "$tarfile" -- "${archive_members[@]}"                         \
                                || die "error creating the tar archive" $?
) || exit $?

# 200 bytes or less looks empty (a '.' directory alone takes 104 bytes)
tarfile_size=$(stat --printf="%s" "$tarfile")
test "${tarfile_size:-0}" -gt 200                                           \
                                || die "tar file creation failed" 1
echo >&2

# clean up
# Removes the copied metadata and the SPident database file from the target
# directory, so that only the archive remains.
(
    echo "cleaning up..." >&2

    # die() ends the subshell, so nothing is removed from a wrong directory
    cd "$TARGET_DIR"            || die "cannot cd to $TARGET_DIR" $?

    # Only the constant patterns go through eval; each one is cut down to its
    # first path component, because whole top-level entries are removed.
    shopt -s nullglob
    eval "leftovers=( ${metadata_files[*]%%/*} )"

    # The product name comes from the command line. It is passed to rm as it
    # is (never re-parsed by eval), and "./" pins it to the target directory.
    if [ -n "$PRODUCT_NAME" ] ; then
        leftovers=( "${leftovers[@]}" "./$PRODUCT_NAME" )
    fi

    if [ ${#leftovers[@]} -gt 0 ] ; then
        rm -rf -- "${leftovers[@]}"
    fi
)

