clawd/tmp/contacts/postprocess.py

156 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
Post-process merged.vcf:
1. Remove contacts with no email AND no phone
2. Remove contacts with no name (FN or ORG) — list their emails for review
3. Convert Dutch phone numbers (0X...) to international format (+31...)
"""
import re, sys
# Absolute path of the vCard file that main() reads via parse_blocks().
INPUT = "/home/johan/clawd/tmp/contacts/merged.vcf"
# Absolute path that main() writes the filtered, phone-normalised vCards to.
OUTPUT = "/home/johan/clawd/tmp/contacts/final.vcf"
# ── vCard parser (simple block-based) ─────────────────────────────────────────
def parse_blocks(path):
    """Read a vCard file and return its cards as a list of text blocks.

    The file is decoded as UTF-8 (undecodable bytes are replaced). Folded
    lines -- a line break followed by a single space or tab -- are joined
    back onto the preceding line before the text is cut at every
    ``BEGIN:VCARD`` marker (case-insensitive). Each returned block is
    whitespace-stripped and starts with ``BEGIN:VCARD``; any text before
    the first marker is discarded.
    """
    with open(path, encoding='utf-8', errors='replace') as handle:
        text = handle.read()

    # Undo vCard line folding so every property sits on one physical line.
    unfolded = re.sub(r'\r?\n[ \t]', '', text)

    # The zero-width lookahead splits *before* each marker, keeping it in the piece.
    pieces = (
        piece.strip()
        for piece in re.split(r'(?=BEGIN:VCARD)', unfolded, flags=re.I)
    )
    return [piece for piece in pieces if piece.upper().startswith('BEGIN:VCARD')]
def block_lines(block):
    """Return the content lines of a vCard block.

    Blank (whitespace-only) lines are dropped, as are lines that are exactly
    ``BEGIN:VCARD``, ``END:VCARD`` or ``VERSION:3.0`` (compared
    case-insensitively). The envelope comparison is made on the unstripped
    line, so an envelope line carrying extra whitespace is kept.
    """
    envelope = {'BEGIN:VCARD', 'END:VCARD', 'VERSION:3.0'}
    content = []
    for raw_line in block.splitlines():
        if not raw_line.strip():
            continue
        if raw_line.upper() in envelope:
            continue
        content.append(raw_line)
    return content
def get_field_val(block, prefix):
    """Return the stripped value of the first line whose key starts with *prefix*.

    Each line is split at its first ':'; the part before it is the key
    (property name plus any parameters). The key may carry an optional
    ``itemN.`` group prefix and is matched case-insensitively. *prefix* is
    inserted into a regular expression anchored only at the start of the
    key, so ``'N'`` also matches ``NOTE``. Returns ``''`` when no line
    matches; the first matching line wins even if its value is empty.
    """
    key_pattern = r'(ITEM\d+\.)?' + prefix
    split_lines = (line.partition(':') for line in block.splitlines())
    values = (
        value.strip()
        for key, _, value in split_lines
        if re.match(key_pattern, key.strip(), re.I)
    )
    return next(values, '')
def has_email(block):
    """Return True if the vCard block has an EMAIL property with a non-empty value.

    The property name is matched case-insensitively at the start of a line,
    optionally preceded by an ``itemN.`` group prefix and followed by
    parameters (``;TYPE=...``). A line such as ``EMAIL:`` whose value is
    empty or whitespace-only does not count, matching how get_emails()
    ignores empty values.
    """
    # [^:\n]* skips parameters up to the first ':' on the same line;
    # [^\S\n]*\S then demands at least one non-blank value character
    # before the line ends.
    pattern = r'^(?:ITEM\d+\.)?EMAIL\b[^:\n]*:[^\S\n]*\S'
    return bool(re.search(pattern, block, re.I | re.M))
def has_phone(block):
    """Return True if the vCard block has a TEL property with a non-empty value.

    The property name is matched case-insensitively at the start of a line,
    optionally preceded by an ``itemN.`` group prefix and followed by
    parameters (``;TYPE=...``). A line such as ``TEL:`` whose value is empty
    or whitespace-only does not count, so a card carrying only blank phone
    fields is treated as having no phone.
    """
    # [^:\n]* skips parameters up to the first ':' on the same line;
    # [^\S\n]*\S then demands at least one non-blank value character
    # before the line ends.
    pattern = r'^(?:ITEM\d+\.)?TEL\b[^:\n]*:[^\S\n]*\S'
    return bool(re.search(pattern, block, re.I | re.M))
def get_name(block):
    """Return the contact's display name: the FN value, or the ORG value if FN is empty.

    The result is ``''`` when neither field yields a value.
    """
    # `or` only looks up ORG when FN is missing or empty.
    return get_field_val(block, 'FN') or get_field_val(block, 'ORG')
def get_emails(block):
    """Return every non-empty EMAIL value in the block, stripped and lower-cased.

    Lines are split at their first ':'; keys are matched case-insensitively
    and may carry an ``itemN.`` group prefix. Values are returned in file
    order, duplicates included.
    """
    split_lines = (line.partition(':') for line in block.splitlines())
    return [
        value.strip().lower()
        for key, _, value in split_lines
        if value.strip() and re.match(r'(ITEM\d+\.)?EMAIL', key.strip(), re.I)
    ]
# ── Dutch phone normaliser ─────────────────────────────────────────────────────
def nl_to_intl(phone):
    """
    Convert a Dutch local phone number to international (+31) format.

    Rules:
    - Already international (starts with '+' or '00') → returned as-is
    - Anything that is not exactly 10 digits starting with 0 (e.g. US
      numbers, short service numbers) → returned as-is
    - 10-digit numbers starting with 0 → leading 0 replaced by +31

    When the input starts with the trunk '0', the remaining spaces, dashes
    and dots are preserved. When other characters precede the '0' (for
    example "(020) 1234567"), the original layout cannot be kept around the
    removed digit, so the compact form "+31" + nine digits is returned.

    The input is whitespace-stripped in every case.
    """
    p = phone.strip()

    # Already international
    if p.startswith(('+', '00')):
        return p

    # Strip formatting to count digits
    digits_only = re.sub(r'\D', '', p)

    # Must start with 0 and be 10 digits to be Dutch local
    if len(digits_only) != 10 or not digits_only.startswith('0'):
        return p

    if p.startswith('0'):
        # Drop the trunk '0' and keep the rest of the formatting:
        #   "06 23 123 456" → "+316 23 123 456"
        #   "0646438755"    → "+31646438755"
        #   "020-1234567"   → "+3120-1234567"  (Amsterdam landline)
        return '+31' + p[1:]

    # The first character is not the trunk '0' (e.g. an opening parenthesis);
    # slicing p[1:] would drop that character and keep the 0, so rebuild the
    # number from its digits instead:
    #   "(020) 1234567" → "+31201234567"
    return '+31' + digits_only[1:]
def convert_phones_in_block(block):
    """Return *block* with every TEL value passed through nl_to_intl().

    Lines are split at their first ':'; a line is treated as a phone line
    when its key (optionally prefixed with ``itemN.``) starts with TEL,
    case-insensitively, and its value is non-empty. A phone line is rebuilt
    as ``key:new_value`` only when the number actually changes; all other
    lines are passed through untouched. The result is re-joined with '\\n'.
    """
    def rewrite(line):
        key, _, value = line.partition(':')
        number = value.strip()
        if not number or not re.match(r'(ITEM\d+\.)?TEL', key.strip(), re.I):
            return line
        converted = nl_to_intl(number)
        return line if converted == number else f'{key}:{converted}'

    return '\n'.join(rewrite(line) for line in block.splitlines())
# ── main ──────────────────────────────────────────────────────────────────────
def main():
    """Filter and normalise the contacts in INPUT and write them to OUTPUT.

    For every vCard block read from INPUT:
      1. drop it when it has neither an email nor a phone;
      2. drop it when it has no name (FN or ORG), remembering its emails so
         they can be listed for manual review;
      3. otherwise convert Dutch local phone numbers to +31 format and keep it.

    A summary is printed to stdout, the kept blocks are written to OUTPUT
    (each followed by a blank line), and finally the sorted, de-duplicated
    emails of the nameless contacts are printed.
    """
    blocks = parse_blocks(INPUT)
    print(f"Input: {len(blocks)} contacts")

    kept = []
    removed_no_contact = 0
    removed_no_name_emails = []  # emails of nameless contacts
    phones_converted = 0

    for block in blocks:
        # Step 1: must have email or phone
        if not has_email(block) and not has_phone(block):
            removed_no_contact += 1
            continue

        # Step 2: must have a name
        name = get_name(block)
        if not name or not name.strip():
            emails = get_emails(block)
            removed_no_name_emails.extend(emails if emails else ['(no email — phone only)'])
            continue

        # Step 3: Dutch phone normalisation
        new_block = convert_phones_in_block(block)
        # convert_phones_in_block() rewrites line by line, so each differing
        # line is exactly one converted number. Counting lines (rather than
        # blocks) keeps the "Phone numbers converted" figure accurate for
        # contacts that have several numbers.
        phones_converted += sum(
            old != new
            for old, new in zip(block.splitlines(), new_block.splitlines())
        )
        kept.append(new_block)

    print(f"Removed (no email+phone): {removed_no_contact}")
    print(f"Removed (no name): {len(removed_no_name_emails)} email(s) from nameless contacts")
    print(f"Phone numbers converted: {phones_converted}")
    print(f"Output: {len(kept)} contacts")

    # Write final vcf
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        for block in kept:
            # Defensive: make sure every card is wrapped in its envelope.
            if not block.startswith('BEGIN:VCARD'):
                block = 'BEGIN:VCARD\nVERSION:3.0\n' + block
            if not block.endswith('END:VCARD'):
                block = block + '\nEND:VCARD'
            f.write(block + '\n\n')
    print(f"\nWritten to: {OUTPUT}")

    if removed_no_name_emails:
        print(f"\n── Nameless contacts (emails for review) ──────────────────")
        for e in sorted(set(removed_no_name_emails)):
            print(f" {e}")
# Script entry point: run the pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()