#!/usr/bin/env python3 # dedup.py # Removes duplicates from Bitwarden export .csv # 2019-02-09 5erif import sys import hashlib from urllib.parse import urlparse # Field ordinals in Bitwarden CSV FOLDER = 0 FAVORITE = 1 TYPE = 2 NAME = 3 NOTES = 4 FIELDS = 5 URI = 6 USERNAME = 7 PASSWORD = 8 TOTP = 9 def main(argv): if len(argv) < 1: print('Missing input file path') sys.exit(1) in_file_path = argv[0] out_file_path = in_file_path[0:(len(in_file_path)-4)]+'_out.csv' rem_file_path = in_file_path[0:(len(in_file_path)-4)]+'_rem.csv' completed_lines_hash = set() line_number = -1 write_count = 0 cache = '' out_file = open(out_file_path, 'w', encoding = 'utf8') rem_file = open(rem_file_path, 'w', encoding = 'utf8') for line in open(in_file_path, 'r', encoding = 'utf8'): line_number += 1 fields = line.split(',') if len(fields) < 10: # Add previous line if short line = cache.strip('\n') + line cache = line fields = line.split(',') if len(fields) > 9: print(f'Recovered with line {line_number}:\n{line}') cache = '' else: print(f'Missing fields in line {line_number}:\n{line}') rem_file.write(line) continue else: cache = '' if line_number != 0: domain = urlparse(fields[URI]).netloc if len(domain) > 0: fields[URI] = domain token = fields[URI] + fields[USERNAME] + fields[PASSWORD] hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest() if hashValue not in completed_lines_hash: out_file.write(line) completed_lines_hash.add(hashValue) write_count += 1 else: rem_file.write(line) # Uncomment for verbose mode # print(f'Skipping duplicate on line {line_number}:\n{line}') out_file.close() rem_file.close() dup_count = line_number - write_count print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved') print(f'\n{dup_count} duplicates saved to {rem_file_path}') if __name__ == "__main__": main(sys.argv[1:])