-
Notifications
You must be signed in to change notification settings - Fork 0
/
uniq-in-order
executable file
·89 lines (71 loc) · 3.05 KB
/
uniq-in-order
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
# This can be achieved by a one-liner:
# perl -ne '$H{$_}++ or print'
"""
Filter even non-adjacent matching lines from input, writing to output.
It is a compatible replacement for uniq, but supports only a small subset of
options and functionality. It is ideal for replacing `sort | uniq` pipelines
where the order of the original lines is important.
"""
__version__ = '1.0'
import argparse
import sys
from collections import deque
from collections import OrderedDict
def stream_uniq(f_in, ignore_case=False, repeated_only=False, print_counts=False):
    """Yield the distinct lines of ``f_in`` in order of first appearance.

    Lines are compared after stripping trailing CR/LF characters, so the
    yielded text never carries a line terminator.

    Args:
        f_in: Text file-like object; only its ``readline()`` method is used.
        ignore_case: Compare lines by their lower-cased form.
        repeated_only: Report only lines that occur more than once.
        print_counts: Defer all output until the input is exhausted and
            report occurrence counts.

    Yields:
        Without ``print_counts``: a ``str`` as soon as a line qualifies --
        on its first occurrence, or on its second occurrence when
        ``repeated_only`` is set.
        With ``print_counts``: ``(count, line)`` tuples after end of input,
        ordered by first appearance, where ``line`` is the text of the
        first occurrence; lines seen once are skipped when
        ``repeated_only`` is set.
    """
    counts = OrderedDict()  # comparison key -> occurrences, in first-seen order
    first_seen = {}         # comparison key -> line text as it first appeared
    while True:
        # use .readline() rather than `for line in f_in:` due to buffering
        line = f_in.readline()
        if not line:
            break
        line = line.rstrip('\r\n')
        compare = line.lower() if ignore_case else line
        seen = counts.get(compare, 0) + 1
        counts[compare] = seen
        if seen == 1:
            first_seen[compare] = line
        # if printing counts, wait until end to print
        if print_counts:
            continue
        # repeated: emit on the 2nd occurrence; unique: emit on the 1st
        if seen == (2 if repeated_only else 1):
            yield line
    # if printing counts, print all at end
    if print_counts:
        # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
        for compare, count in counts.items():
            if repeated_only and count == 1:
                continue
            # report the original text, not the case-folded comparison key
            yield (count, first_seen[compare])
def print_uniq(f_in, f_out, ignore_case=False, repeated_only=False,
               print_counts=False):
    """Write the results of ``stream_uniq`` over ``f_in`` to ``f_out``.

    Each result is written as one newline-terminated line. When
    ``print_counts`` is set, every line is prefixed with its occurrence
    count formatted as ``'%4d '``.

    Args:
        f_in: Input file-like object, passed through to ``stream_uniq``.
        f_out: Output file-like object; only ``write()`` is called on it.
        ignore_case: Passed through to ``stream_uniq``.
        repeated_only: Passed through to ``stream_uniq``.
        print_counts: Passed through to ``stream_uniq``; also selects the
            count-prefixed output format.
    """
    results = stream_uniq(f_in=f_in, ignore_case=ignore_case,
                          repeated_only=repeated_only,
                          print_counts=print_counts)
    for entry in results:
        if not print_counts:
            # plain mode: each entry is just the line text
            f_out.write(entry + '\n')
            continue
        # count mode: each entry is a (count, line) pair
        occurrences, text = entry
        f_out.write(('%4d ' % occurrences) + text + '\n')
if __name__ == '__main__':
    # Command-line entry point: parse options, then filter input to output.
    # The raw formatter keeps the module docstring's line breaks in --help.
    parser = argparse.ArgumentParser(
        usage='%(prog)s [options] [input [output]]',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Flags named after the uniq options they stand in for.
    parser.add_argument('-c', '--count', action='store_true',
                        help='prefix lines by the number of occurrences')
    parser.add_argument('-d', '--repeated', action='store_true',
                        help='only print duplicate lines')
    parser.add_argument('-i', '--ignore-case', action='store_true',
                        help='ignore differences in case when comparing')

    # Optional positional files; fall back to the standard streams.
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help='default: stdin')
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout, help='default: stdout')

    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __version__)

    options = parser.parse_args()
    print_uniq(f_in=options.input,
               f_out=options.output,
               ignore_case=options.ignore_case,
               repeated_only=options.repeated,
               print_counts=options.count)