-
Notifications
You must be signed in to change notification settings - Fork 3
/
list_senders.py
executable file
·68 lines (58 loc) · 2.21 KB
/
list_senders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
import sys
# Abort early when the interpreter is older than Python 3.
req_version = (3, 0)
cur_version = sys.version_info
if cur_version < req_version:
    print("Error! you need to use this script with Python 3+")
    # Exit with a non-zero status so shells and calling scripts can tell the
    # run failed. sys.exit is used because the bare exit() helper is only
    # injected by the site module and is not guaranteed to exist.
    sys.exit(1)
import mailbox
import argparse
from pathlib import Path
from collections import defaultdict
import re
# Default value of the --threshold command-line option.
DEFAULT_THRESHOLD = 50


def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: optional list of argument strings to parse. When None (the
            default) argparse falls back to the process arguments
            (sys.argv[1:]), which is what the script entry point relies on.

    Returns:
        An argparse.Namespace with the attributes ``mbox_path`` (str),
        ``threshold`` (int, DEFAULT_THRESHOLD when not given) and
        ``group_by_email`` (bool, False when the flag is absent).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("mbox_path", help="the path of the mbox file")
    parser.add_argument(
        "--threshold",
        help="number of mails to use as the threshold",
        default=DEFAULT_THRESHOLD,
        type=int,
    )
    parser.add_argument(
        "--group-by-email",
        help="mails will be grouped based on the email present in the FROM field. This can be hasardous, as formats may differ.",
        default=False,
        action="store_true",
    )
    return parser.parse_args(argv)
def open_mbox_file(mbox_path=None):
    """Open the mbox file that should be analysed.

    Args:
        mbox_path: path of the mbox file. When None (the default) the path
            is read from the module-level ``args`` namespace, which the
            script entry point binds before calling this function.

    Returns:
        A mailbox.mbox instance opened on the given path.

    Raises:
        SystemExit: with status 1 when the path is not an existing file.
    """
    if mbox_path is None:
        mbox_path = args.mbox_path
    # Check explicitly: mailbox.mbox() creates a missing file by default
    # (create=True), which would hide a mistyped path.
    if not Path(mbox_path).is_file():
        print("path '%s' is not a file" % (mbox_path,))
        # Non-zero status: this is an error, not a successful run.
        sys.exit(1)
    return mailbox.mbox(mbox_path)
def get_frequencies(mbox, group_by_email):
    """Count how many messages each sender contributed.

    The sender is taken from the ``From`` header of each message. The mbox
    envelope line returned by ``get_from()`` is only used as a fallback when
    the header is missing: that line carries a delivery timestamp, so using
    it as a grouping key would make nearly every message its own group.

    Args:
        mbox: iterable of messages (e.g. a mailbox.mbox). Each message must
            support header lookup (``message["From"]``) and ``get_from()``.
        group_by_email: when true, group on the first e-mail address found
            in the sender text ("no email found" when there is none);
            otherwise group on the full sender text.

    Returns:
        A defaultdict mapping each grouping key to its message count.
    """
    # Compiled once, outside the loop; same pattern as before.
    email_pattern = re.compile(r'[\w.+-]+@[\w.+-]+')
    frequencies = defaultdict(int)
    for message in mbox:
        sender = message["From"]
        if sender is None:
            # No From header: fall back to the envelope sender line.
            sender = message.get_from()
        # Header lookup may return a Header object for non-ASCII values.
        full_from = str(sender)
        if group_by_email:
            match = email_pattern.search(full_from)
            key = match.group(0) if match else "no email found"
        else:
            key = full_from
        frequencies[key] += 1
    return frequencies
def filter_frequencies(frequencies, threshold):
    """Return the entries whose count is strictly greater than the threshold.

    Args:
        frequencies: mapping of sender key to message count.
        threshold: counts must exceed this value to be kept.

    Returns:
        A new dict holding the retained key/count pairs, in the iteration
        order of the input mapping.
    """
    def is_above_threshold(entry):
        return entry[1] > threshold

    return dict(filter(is_above_threshold, frequencies.items()))
def sort_frequencies(frequencies):
    """Order senders from the most to the least frequent.

    Args:
        frequencies: mapping of sender key to message count.

    Returns:
        A list of (key, count) 2-tuples sorted by descending count. Entries
        with equal counts keep the iteration order of the input mapping.
    """
    # Sort the mapping that was passed in, not a module-level global.
    return sorted(
        frequencies.items(),
        key=lambda kv: kv[1],
        reverse=True,
    )
if __name__ == '__main__':
    # Script entry point: parse the options, count the senders and report
    # those above the threshold, most frequent first.
    # `args` is bound at module level because open_mbox_file() reads it.
    args = parse_args()
    mbox = open_mbox_file()
    frequencies = get_frequencies(mbox, args.group_by_email)
    frequencies_filtered = filter_frequencies(frequencies, args.threshold)
    if frequencies_filtered:
        frequencies_sorted = sort_frequencies(frequencies_filtered)
        for sender, count in frequencies_sorted:
            print("%s mails from : '%s'" % (count, sender))
    else:
        # Nothing passed the threshold: report it and exit with status 1.
        print("no matches ! no single sender sent you over %s mails" % args.threshold)
        exit(1)