-
Notifications
You must be signed in to change notification settings - Fork 0
/
index
executable file
·68 lines (56 loc) · 2.1 KB
/
index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
#
# Creates one Elasticsearch index per collection described in a JSON spec,
# fills each index through Anserini's IndexCollection, then stops the
# Elasticsearch, Logstash and Kibana services.
#
# Arguments:
#   $2 - JSON document with a top-level "collections" list; the "name" and
#        "path" of every entry are read further down in this script.
#   ($1 is not read by this script.)
JSON=$2
# The number of collections
# ${JSON} is quoted so the shell hands it to jq verbatim: unquoted, it would
# be word-split (collapsing whitespace inside string values) and glob-expanded
# (characters such as * ? [ ] could be replaced by matching file names).
NUM_COLLECTIONS=$(printf '%s\n' "${JSON}" | jq -r '.collections | length')
# Block until Elasticsearch answers on localhost:9200 with HTTP 200.
# The status code is taken as the second space-separated field of the first
# header line returned by a HEAD request; curl's stderr output is discarded.
# Polls every 5 seconds and never gives up on its own.
until
  echo 'Waiting for Elasticsearch...'
  STATUS=$(curl --head 'localhost:9200' 2>/dev/null | head -n 1 | cut -d ' ' -f 2)
  [[ "${STATUS}" == '200' ]]
do
  sleep 5
done
# For each collection in the JSON spec: map its name to the Anserini
# collection/generator classes, create an Elasticsearch index named after the
# collection, index the documents into it, then refresh the index.
#
# A C-style loop is used instead of `seq 0 $((n - 1))`: with zero collections
# some seq implementations count downwards (0, -1) rather than printing nothing.
for ((i = 0; i < NUM_COLLECTIONS; i++)); do
  # ${JSON} is quoted so it reaches jq without word splitting or globbing.
  COLLECTION_NAME=$(printf '%s\n' "${JSON}" | jq -r ".collections[${i}].name")
  COLLECTION_PATH=$(printf '%s\n' "${JSON}" | jq -r ".collections[${i}].path")

  case "${COLLECTION_NAME}" in
    core17)
      COLLECTION="NewYorkTimesCollection"
      GENERATOR="JsoupGenerator"
      ;;
    core18)
      COLLECTION="WashingtonPostCollection"
      GENERATOR="WapoGenerator"
      ;;
    robust04)
      COLLECTION="TrecCollection"
      GENERATOR="JsoupGenerator"
      ;;
    gov2)
      COLLECTION="TrecwebCollection"
      GENERATOR="JsoupGenerator"
      ;;
    cw09b)
      COLLECTION="ClueWeb09Collection"
      GENERATOR="JsoupGenerator"
      ;;
    cw12b)
      COLLECTION="ClueWeb12Collection"
      GENERATOR="JsoupGenerator"
      ;;
    *)
      # Diagnostics belong on stderr. 255 is the status bash reports for the
      # out-of-range `exit -1`, spelled out here so it is well defined.
      echo "Unsupported collection" >&2
      exit 255
      ;;
  esac

  # Create index
  curl -XPUT "localhost:9200/${COLLECTION_NAME}"

  # Set refresh interval to 60s for the index created
  curl -XPUT -H 'Content-Type: application/json' \
    "localhost:9200/${COLLECTION_NAME}/_settings" \
    -d '{ "index": { "refresh_interval": "60s"}}'

  # Indexing through Anserini (Elastirini).
  # Every expansion is quoted so that a collection path containing spaces or
  # glob characters is still passed as a single -input argument.
  sh /work/anserini/target/appassembler/bin/IndexCollection \
    -collection "${COLLECTION}" -generator "${GENERATOR}" \
    -es -es.index "${COLLECTION_NAME}" \
    -threads "$(nproc)" -input "${COLLECTION_PATH}" \
    -storePositions -storeDocvectors -storeRawDocs

  # Refresh
  curl -XPOST "localhost:9200/${COLLECTION_NAME}/_refresh"
done
# Shut down the ELK stack once indexing is done, in this order:
# Elasticsearch, then Logstash, then Kibana.
for svc in elasticsearch logstash kibana; do
  service "${svc}" stop
done