From 949619b04f3287c908407f652d50e9ec5bd6813a Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 11 Jun 2019 15:02:22 +0100 Subject: [PATCH] fixed for readmes and robust04 --- index | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/index b/index index ace09d6..57fe5d7 100755 --- a/index +++ b/index @@ -73,7 +73,14 @@ for collection in args.json["collections"]: {0}""".format(path).split(), env=my_env) #grep out any readmes - subprocess.run(["/bin/sh", "-c", "egrep -vi (readme|dtd) /work/terrier-core/etc/collection.spec > /work/terrier-core/etc/collection.spec.new; mv /work/terrier-core/etc/collection.spec.new /work/terrier-core/etc/collection.spec"], env=my_env) + subprocess.run(["/bin/sh", "-c", "egrep -vi 'readme' /work/terrier-core/etc/collection.spec > /work/terrier-core/etc/collection.spec.new; mv /work/terrier-core/etc/collection.spec.new /work/terrier-core/etc/collection.spec"], env=my_env) + + #grep out congressional record + if name == "robust04": + subprocess.run(["/bin/sh", "-c", "egrep -vi 'cr93|read|dtd' /work/terrier-core/etc/collection.spec > /work/terrier-core/etc/collection.spec.new; mv /work/terrier-core/etc/collection.spec.new /work/terrier-core/etc/collection.spec"], env=my_env) + + print("Files to index...") + subprocess.run(["wc", "-l", "/work/terrier-core/etc/collection.spec"]) cmd=""" /work/terrier-core/bin/terrier batchindexing -p