#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import datetime
import hashlib
import json
import multiprocessing
import time
import traceback
from pathlib import Path
from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, TypeVar

import colored

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10
from .system.hexhashof import hexhashof

# Number of worker processes used to hash files in parallel.
WORKERS = 4

T = TypeVar('T')


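# Entry point: with no encoded arguments, run with the default configuration;
# otherwise let the repository's parse_cmdline decode them and invoke
# run_with_config.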
def cmdline(encoded_args: str = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config():
    main()


# Small tuple helpers for the map/filter pipelines below.
def get_first_elem(e): return e[0]
def get_second_elem(e): return e[1]
def reverse_key_value(e): return e[1], e[0]


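# Value object pairing a file path with its SHA-256 hex digest.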
class FileHash:
    def __init__(self, file: str, hash: str):
        self.file = file
        self.hash = hash


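# Worker process: pulls (seq, total, path) tuples from file_queue, hashes
# each file with SHA-256, and pushes FileHash results onto hash_queue until
# the (0, 0, None) sentinel arrives.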
class FileHasher(multiprocessing.Process):
    def __init__(self, file_queue, hash_queue):
        multiprocessing.Process.__init__(self)
        self.file_queue = file_queue
        self.hash_queue = hash_queue

    def run(self):
        proc_name = self.name
        try:
            while True:
                seq, total, file = self.file_queue.get()
                if file is not None:
                    print(colored.stylize(
                        f'{proc_name}:{colored.fg("cyan")} {seq}/{total}:'
                        f'{colored.attr("reset")}{colored.attr("dim")} {file}',
                        [colored.fg('yellow')]))
                    self.hash_queue.put(FileHash(
                        str(file),
                        hexhashof(Path(file).read_bytes(), hashlib.sha256)
                    ))
                else:
                    # A None file is the shutdown sentinel.
                    print(colored.stylize(f'{proc_name}: Exiting', [
                        colored.fg('red'), colored.attr('bold')]))
                    self.hash_queue.put(None)
                    break
        except Exception:
            traceback.print_exc()
            try:
                # Still signal completion so the consumer does not block
                # forever waiting for this worker.
                self.hash_queue.put(None)
            except Exception:
                pass
        return


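# End-to-end run: load known hashes and file sizes, enqueue files that still
# need hashing, stream results into i_gdl_hashes.txt with a progress and ETA
# display, then sort that file and write groups of repeated hashes to
# i_gdl_rh.json.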
def main():
    print(colored.stylize('Reading files...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
    hashes_path = Path('i_gdl_hashes.txt')
    if not hashes_path.exists():
        hashes_path.write_text('')
    hashes_text = list(filter(len, hashes_path.read_text().splitlines()))
    # Each line on disk is 'hash|file'; keep entries as (file, hash) pairs.
    hashes: FrozenSet[Tuple[str, str]] = frozenset(map(reverse_key_value, filter(
        get_second_elem, map(lambda a: a.split('|', 1), hashes_text))))
    files_size: Dict[str, int] = json.loads(Path('i_gdl_fsz.json').read_text())
    # Keep only hashes whose files still have a recorded non-zero size.
    hashes = frozenset(filter(lambda a: files_size.get(a[0], 0), hashes))
    non_empty_files_size_dict = dict(
        filter(lambda a: a[1], files_size.items()))
    non_empty_files = frozenset(non_empty_files_size_dict.keys())
    downloaded_files_to_be_hashed = non_empty_files.difference(
        map(get_first_elem, hashes))
    total_file_size_bytes = sum(files_size.values())
    total_file_size = format_power10(total_file_size_bytes)
    hashed_size_bytes = sum(
        map(lambda l: files_size[l], map(get_first_elem, hashes)))
    hashes = set(hashes)
    total = len(downloaded_files_to_be_hashed)
    files_queue = multiprocessing.Queue()
    hashes_queue = multiprocessing.Queue()
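    # Tasks are (sequence number, total, file path); the workers use the
    # numbers for their progress lines.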
    print(colored.stylize('Filling queues...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
    for enumeration, downloaded_file_to_be_hashed in enumerate(
            sorted(downloaded_files_to_be_hashed)):
        files_queue.put(
            (enumeration + 1, total, str(downloaded_file_to_be_hashed)))
    del enumeration
    del total
    # One (0, 0, None) sentinel per worker signals shutdown.
    for _ in range(WORKERS):
        files_queue.put((0, 0, None))
    print(colored.stylize('Starting processes...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
    workers = list()
    for _ in range(WORKERS):
        worker = FileHasher(files_queue, hashes_queue)
        workers.append(worker)
    for worker in workers:
        worker.start()
    print(colored.stylize('Listening queues...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
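    # (progress fraction, unix timestamp) samples; a sliding window of the
    # most recent 128 drives the completion-time estimate.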
    timings: List[Tuple[float, float]] = list()
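    # Append new 'hash|file' lines as results arrive; the file is rewritten
    # sorted once all workers finish.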
    with hashes_path.open('at') as hashes_handler:
        active_workers = WORKERS
        while active_workers > 0:
            file_hash: FileHash = hashes_queue.get()
            if file_hash is not None:
                hashed_size_bytes += files_size[file_hash.file]
                progress_pct = hashed_size_bytes / max(1, total_file_size_bytes)
                timings.append((progress_pct, time.time()))
                while len(timings) > 128:
                    del timings[0]
                end_prediction = ''
                if len(timings) > 1:
                    # Estimate remaining time from the rate of progress
                    # across the sampling window.
                    dp = timings[0][0] - timings[-1][0]
                    dt = timings[0][1] - timings[-1][1]
                    secs_pred = (1 - progress_pct) * (dt / dp)
                    td = datetime.timedelta(seconds=secs_pred)
                    end_prediction = (f' - {td}' +
                                      f' - {datetime.datetime.now() + td}')
                print(colored.stylize(
                    '%11.6f%% - %s of %s%s' % (
                        100 * progress_pct,
                        format_power10(hashed_size_bytes),
                        total_file_size,
                        end_prediction),
                    [colored.fg('light_green'), colored.attr('bold')]))
                hashes_handler.write(f'{file_hash.hash}|{file_hash.file}\n')
            else:
                # Each None marks one worker that has exited.
                active_workers -= 1
            del file_hash
        del active_workers
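    # Every worker has sent its sentinel; join the processes and release the
    # queue feeder threads.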
    print(colored.stylize('Stopping processes...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
    for worker in workers:
        worker.join()
    del worker
    del workers
    files_queue.close()
    files_queue.join_thread()
    hashes_queue.close()
    hashes_queue.join_thread()
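    # Rewrite the hash list sorted, dropping entries whose files are gone.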
    print(colored.stylize('Sorting output file...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
    hashes_path.write_text(
        '\n'.join(
            list(filter(
                lambda a: files_size.get(a.split('|', 1)[1], 0),
                sorted(hashes_path.read_text().splitlines())
            ))
        ) + '\n')
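    # Group file paths by hash so duplicated downloads can be reported.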
    print(colored.stylize('Pointing out repeated hashes...', [
        colored.fg('light_cyan'), colored.attr('bold')]))
    repeated_hashes = dict()
    for hashed, location in map(lambda a: a.split('|', 1),
                                sorted(hashes_path.read_text().splitlines())):
        if hashed not in repeated_hashes:
            repeated_hashes[hashed] = list()
        repeated_hashes[hashed].append(location)
    # Keep only hashes seen at more than one location, most duplicated first.
    Path('i_gdl_rh.json').write_text(json.dumps(
        dict(sorted(
            list(filter(lambda a: len(a[1]) > 1, repeated_hashes.items())),
            key=lambda a: (-len(a[1]), a[0])
        )),
        indent=1,
    ))
    print(colored.stylize(
        'Done', [colored.fg('light_cyan'), colored.attr('bold')]))