#!/usr/bin/python

import os
import sys
import time
import re
import textwrap

from debian_bundle.debfile import DebFile

# Global hash table collecting every file seen during the scan.
# Keys are MD5 sum strings; each value is the list of FileContainer
# objects for the files that have that MD5 sum.
c = {}

# A file counts as duplicated when more than this many files share
# its MD5 sum.
DUP_CUTOFF = 1

# Only print the top offenders: at most this many duplicate groups
# are reported.
REPORT_MAX = 50

# Architecture suffix of the .deb files to scan; "_all.deb" packages
# are scanned as well.
ARCH = 'amd64'

class FileContainer(object):
    """Record describing a single file shipped by a package.

    Attributes:
        name: the file name as passed in.
        pkg: the part of the package file's basename that precedes the
            first underscore.
        md5sum: the file's MD5 sum as passed in.
    """
    def __init__(self, name, pkgfile, md5sum):
        deb_basename = os.path.basename(pkgfile)
        # Keep only what comes before the first '_' of the file name.
        pkg_name = deb_basename.partition('_')[0]
        self.md5sum = md5sum
        self.pkg = pkg_name
        self.name = name

    def __repr__(self):
        fields = (self.pkg, self.name)
        return "%s: %s" % fields

class PkgListWrapper(textwrap.TextWrapper):
    """A customized textwrap class that doesn't break words at '-'.

    Wraps to 80 columns with a two-space indent on every line, leaves
    tabs and other whitespace untouched, and never splits a long word.
    """
    def __init__(self):
        # Go through the base-class constructor so that every attribute
        # TextWrapper relies on gets initialised, not only the ones
        # customized here; setting the attributes by hand leaves the
        # remaining ones undefined and makes fill()/wrap() fail.
        # The explicit call (rather than super()) works whether
        # TextWrapper is an old-style or a new-style class.
        textwrap.TextWrapper.__init__(
            self,
            width=80,
            initial_indent="  ",
            subsequent_indent="  ",
            expand_tabs=False,
            replace_whitespace=False,
            fix_sentence_endings=False,
            break_long_words=False)

    def _split(self, text):
        """Split text into chunks at runs of whitespace only.

        Overrides the base-class splitter so that hyphens are not
        treated as break opportunities.  The whitespace runs are kept
        as chunks of their own; empty chunks (produced when the text
        starts or ends with whitespace) are dropped.
        """
        return [chunk for chunk in re.split(r'(\s+)', text) if chunk]


def process_package(pkgfile, counters):
    """Process one .deb file.

    If the file has a DEBIAN/md5sums control file, loop over it,
    building FileContainer objects from the info.  The objects
    are then added to the global hash table under their hash key.

    pkgfile is the path of the .deb file.  counters is a dict whose
    'pkgs' entry is incremented by one and whose 'total_files' entry
    is incremented by the number of md5sums entries; both are left
    untouched when the package cannot be opened or has no md5sums.

    A package that cannot be opened is reported on stderr and skipped.
    """
    try:
        deb = DebFile(pkgfile)
    except Exception:
        # Only trap real errors: a bare "except:" would also swallow
        # KeyboardInterrupt/SystemExit and make the scan unstoppable.
        sys.stderr.write("Error scanning %s: %s\n"
                         % (pkgfile, sys.exc_info()[1]))
        return

    if 'md5sums' not in deb.control:
        return

    md5sums = deb.md5sums()
    counters['pkgs'] += 1
    counters['total_files'] += len(md5sums)
    for name, md5 in md5sums.items():
        # Group the files by MD5 sum in the global table.
        c.setdefault(md5, []).append(FileContainer(name, pkgfile, md5))

def find_duplicates():
    """Return the duplicated entries of the global hash table.

    The result is a new dict holding every key of the global table
    whose list has more than DUP_CUTOFF elements, mapped to that
    same list.
    """
    return dict((md5, entries) for md5, entries in c.items()
                if len(entries) > DUP_CUTOFF)

def process_dir(dir):
    """Scan a directory tree for packages and report duplicated files.

    Walks dir recursively, processes every file whose name ends in
    "_<ARCH>.deb" or "_all.deb", then prints scan statistics followed
    by the REPORT_MAX largest groups of files sharing an MD5 sum.
    For each group the packages involved and up to ten sample file
    names are listed.
    """
    counters = {'pkgs': 0, 'total_files': 0}
    suffixes = ("_%s.deb" % ARCH, "_all.deb")

    print("Scanning...")
    start_time = time.time()
    for root, dirs, filenames in os.walk(dir):
        for filename in filenames:
            if filename.endswith(suffixes):
                process_package(os.path.join(root, filename), counters)
    end_time = time.time()

    # No package processed (empty tree, nothing matching): report an
    # average of 0 rather than dying with ZeroDivisionError.
    if counters['pkgs']:
        files_per_pkg = counters['total_files'] // counters['pkgs']
    else:
        files_per_pkg = 0
    print("%d packages processed in %.2f seconds"
          " totalling %d files (%d files/pkg)"
          % (counters['pkgs'], end_time - start_time,
             counters['total_files'], files_per_pkg))

    print("Looking for duplicates...")
    start_time = time.time()
    d = find_duplicates()
    end_time = time.time()
    print("Found %d duplicates in %.2f seconds\n"
          % (len(d), end_time - start_time))

    # Largest groups first, truncated to the top offenders.
    worst = sorted(d.items(), key=lambda item: len(item[1]),
                   reverse=True)[:REPORT_MAX]
    for h, entries in worst:
        pkg_list = set(entry.pkg for entry in entries)
        pkg_count = len(pkg_list)
        # De-duplicate the first ten names; sorted so that the output
        # is stable from one run to the next.
        sample_names = sorted(set([entry.name for entry in entries][:10]))
        plural = 's' if pkg_count > 1 else ''
        print("%s present %d times in %d package%s:\n%s"
              % (h, len(entries), pkg_count, plural,
                 PkgListWrapper().fill(', '.join(sorted(pkg_list)))))
        print("Sample names:\n  %s\n" % '\n  '.join(sample_names))

if __name__ == '__main__':
    # The directory to scan is the first command-line argument; print
    # a usage message instead of failing with a bare IndexError when
    # it is missing.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s DIRECTORY\n" % sys.argv[0])
        sys.exit(2)
    process_dir(sys.argv[1])
