dotfiles/scripts/dedupbw

#!/usr/bin/env python3
# dedup.py
# Removes duplicates from Bitwarden export .csv
# 2019-02-09 5erif
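# Usage: ./dedupbw <bitwarden_export.csv>
#   Writes <bitwarden_export>_out.csv (deduplicated entries) and
#   <bitwarden_export>_rem.csv (removed lines) next to the input file.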
import sys
import hashlib
from urllib.parse import urlparse
# Field ordinals in Bitwarden CSV
FOLDER = 0
FAVORITE = 1
TYPE = 2
NAME = 3
NOTES = 4
FIELDS = 5
URI = 6
USERNAME = 7
PASSWORD = 8
TOTP = 9
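# (These ordinals assume the export format this script was written against;
# check them against your CSV's header row before running.)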
def main(argv):
    if len(argv) < 1:
        print('Missing input file path')
        sys.exit(1)
    in_file_path = argv[0]
    # Derive output paths by replacing the '.csv' extension
    out_file_path = in_file_path[:-4] + '_out.csv'
    rem_file_path = in_file_path[:-4] + '_rem.csv'
    # MD5 fingerprints of entries already written to the output file
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''
    out_file = open(out_file_path, 'w', encoding='utf8')
    rem_file = open(rem_file_path, 'w', encoding='utf8')
    for line in open(in_file_path, 'r', encoding='utf8'):
        line_number += 1
        # Naive split: does not honour commas inside quoted fields
        fields = line.split(',')
        if len(fields) < 10:
            # Too few fields: the record is probably split across physical
            # lines (e.g. a notes field containing newlines), so glue this
            # fragment onto the previous one and retry
            line = cache.strip('\n') + line
            cache = line
            fields = line.split(',')
            if len(fields) > 9:
                print(f'Recovered with line {line_number}:\n{line}')
                cache = ''
            else:
                print(f'Missing fields in line {line_number}:\n{line}')
                rem_file.write(line)
                continue
        else:
            cache = ''
        if line_number != 0:
            # Reduce the URI to its hostname (skip the header row) so that
            # scheme and path variants of the same site compare as equal
            domain = urlparse(fields[URI]).netloc
            if len(domain) > 0:
                fields[URI] = domain
        # Dedup key: URI + username + password; MD5 is used only as a
        # fingerprint here, not for security
        token = fields[URI] + fields[USERNAME] + fields[PASSWORD]
        hash_value = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
        if hash_value not in completed_lines_hash:
            out_file.write(line)
            completed_lines_hash.add(hash_value)
            write_count += 1
        else:
            rem_file.write(line)
            # Uncomment for verbose mode
            # print(f'Skipping duplicate on line {line_number}:\n{line}')
    out_file.close()
    rem_file.close()
    # line_number equals the number of data lines read (the header is line 0);
    # exclude the header from write_count so the totals refer to entries only
    unique_count = write_count - 1
    dup_count = line_number - unique_count
    print(f'\nOutput file: {out_file_path}\n{unique_count} unique entries saved')
    print(f'\n{dup_count} removed entries (duplicates or malformed) saved to {rem_file_path}')

if __name__ == "__main__":
    main(sys.argv[1:])