#***************************************************************************
#
# Copyright 2016, by the California Institute of Technology. ALL
# RIGHTS RESERVED. United States Government Sponsorship acknowledged.
# Any commercial use must be negotiated with the Office of Technology
# Transfer at the California Institute of Technology.
#
# @version $Id$
#
#****************************************************************************
#
# Python script to create a download list of products from the OBPG search resource http://oceandata.sci.gsfc.nasa.gov/search/file_search.cgi.
# This download list can then be fed to the downloader for the combine module.
#
# The format of the download list is:
#
# filename sha1_checksum_value
#
# with space as separator.
#
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001000000.L2_SNPP_OC.nc 466cbc5de5c8286a9723eb3c935c20aa98eabbc0
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001000000.L2_SNPP_SST.nc 9f986a23eda7263aeb4c11dfb341c886ab380b5a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001000600.L2_SNPP_OC.nc e2363a1c3db1802ac5222492210bda8f8a6ae7d3
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001000600.L2_SNPP_SST.nc afcd563a7b78731b1fce8e799a439af8a4fc5785
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001001200.L2_SNPP_OC.nc c7cbd092f179c6be700d11a32a868a9e1d3d3d61
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001001200.L2_SNPP_SST3.nc cfd5c9390e5a178acc425de95fe2e85e714dc6a2
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001001200.L2_SNPP_SST.nc 6ccd833d8453661a4e88968d61bdd33f359a691b
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001001800.L2_SNPP_OC.nc 28e24bb477934b4f79dc147f7c35c3ecb3422136
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001001800.L2_SNPP_SST3.nc e6ababb664a428916c47aa4c135045231fbd7c07
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001001800.L2_SNPP_SST.nc d3d2cf1763e29111782dc04cc1d10ca32ad07108
#
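# As a minimal illustration (not part of this script), a downloader consuming
# this list could verify a downloaded file against its listed SHA-1 like so;
# the verify_entry name and local_path argument are hypothetical:
#
#   import hashlib
#
#   def verify_entry(line, local_path):
#       url, expected_sha1 = line.split()
#       sha1 = hashlib.sha1()
#       with open(local_path, "rb") as f:
#           for chunk in iter(lambda: f.read(65536), b""):
#               sha1.update(chunk)
#       return sha1.hexdigest() == expected_sha1
#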
# Some notes about availability of VIIRS data:
#
# http://oceandata.sci.gsfc.nasa.gov/VIIRS/L2/
# http://oceandata.sci.gsfc.nasa.gov/VIIRS/L2/2012
#
# The first VIIRS data is available from 2012, days 002 and 353,
# up to the current time.
import datetime
import getopt
import os
import pathlib
import re
import requests
import sys
import time
#import urllib.request, urllib.error, urllib.parse
from generic_split_search_dates_into_months import generic_split_search_dates_into_months;
from write_final_log import write_final_log
# Make a query to OBPG to fetch a list of filenames and checksums.
#
# The format of the content returned from the query, if the checksum was requested, looks like this:
#
# 466cbc5de5c8286a9723eb3c935c20aa98eabbc0 V2016001000000.L2_SNPP_OC.nc
# 9f986a23eda7263aeb4c11dfb341c886ab380b5a V2016001000000.L2_SNPP_SST.nc
#
# Because the format of the output download list is the opposite, we have to swap the 2 columns.
#
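# As a minimal sketch of that swap (illustrative only; the real logic below
# also prepends the getfile URL and handles the SST/SST3 sort tweak):
#
#   def swap_columns(line, getfile_uri="https://oceandata.sci.gsfc.nasa.gov/cgi/getfile"):
#       checksum, name = line.split()
#       return getfile_uri + "/" + name + " " + checksum
#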
# The following execution results in:
#
# Getting VIIRS:
#
# Just getting files that are current
#
# % python create_generic_download_list.py -l L2 -t "V20*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -g daily -b crawl_current -i output.txt -z 2 -x "/tmp/txt_file_list"
#
# Some notes:
# 1. For some strange reason the -t "V*.nc" filter results in zero files found. We need to add the first 2 digits of the year to become:
# -t "V20*.nc"
# 2. For "-b crawl_current", this Python script will build the -s and -e parameters dynamically.
# Getting the first day the data is available
#
# % python create_generic_download_list.py -l L2 -t "V20*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2012-01-02" -e "2012-01-02" -g daily -x "/tmp/txt_file_list"
#
# Just SST toward the end of the day using filter and -s -e parameters:
#
# % python create_generic_download_list.py -l L2 -t "V2016001000000*SST.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2016-01-01" -e "2016-01-01" -g daily -x "/tmp/txt_file_list"
# % python create_generic_download_list.py -l L2 -t "V2015001235*SST.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g daily -x "/tmp/txt_file_list"
#
# Just SST3 files toward the end of the day using filter and -s -e parameters:
#
# % python create_generic_download_list.py -l L2 -t "V2015001235*SST3.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g daily -x "/tmp/txt_file_list"
#
# Just Ocean Color files toward the end of the day using filter and -s -e parameters:
#
# % python create_generic_download_list.py -l L2 -t "V2015001235*OC.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g daily -x "/tmp/txt_file_list"
#
# A whole day using filter (the name):
#
# % python create_generic_download_list.py -l L2 -t "V2015001*_SNPP_*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -g daily -x "/tmp/txt_file_list"
#
# A whole day using -s and -e parameters (name can be general for 2015):
#
# % python create_generic_download_list.py -l L2 -t "V2015*_SNPP_*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g daily -x "/tmp/txt_file_list"
#
# Getting all files for a whole day using filter (the name):
#
# % python create_generic_download_list.py -l L2 -t "V2015001*_SNPP*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -g daily -x "/tmp/txt_file_list"
#
# Getting all files for a whole day using -s and -e parameters (name can be general for 2015 year):
#
# % python create_generic_download_list.py -l L2 -t "V2015*_SNPP*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g daily -x "/tmp/txt_file_list"
#
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V2015001235*SST.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g hourly -x "/tmp/txt_file_list"
#
# Results in 2 files:
#
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/T2015001235000.L2_SNPP_SST.nc 63a5273e0220da71cdc91a9753907eb47d03ac83
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/T2015001235500.L2_SNPP_SST.nc b99acbb3b5f3996d860c93f07cde3632df7b9617
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V201500122*SST.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g hourly -x "/tmp/txt_file_list"
#
# Results in 10 files:
#
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220000.L2_SNPP_SST.nc 6ca5032e549ffcdf13bf77880a7bd8852dc15ea2
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220600.L2_SNPP_SST.nc 03af00d41cbe88b26a2b0e25fca2e5de3ceb5304
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221200.L2_SNPP_SST.nc aa9ae154a597ec6491fcf55507a8d6d39305d2a9
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221800.L2_SNPP_SST.nc ca629e1f570cb1b87b8f5c8ac2972cb96678a844
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001222400.L2_SNPP_SST.nc 22006616dc70ca52640e2b66481fb62bc027b08a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223000.L2_SNPP_SST.nc 6f55cd55f8dae702a988b4260ba37ba5a1828c14
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223600.L2_SNPP_SST.nc 5fe0bbdd42f9cbbdf7b703554994083876394294
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224200.L2_SNPP_SST.nc c5b755a607c72ccec0982860bdf44b59ceebfe9d
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224800.L2_SNPP_SST.nc 939bfb62f2aad9853e5ed98be6f8554738466744
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001225400.L2_SNPP_SST.nc 4302c478b95fa0c3219f062d2130b4fa2f6fad8c
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V201500122*SST3.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g hourly -x "/tmp/txt_file_list"
#
# Results in 4 files:
#
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220000.L2_SNPP_SST3.nc 426795377132c7158fc7149ec9c9f17dcb13dab4
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220600.L2_SNPP_SST3.nc 5c1520e571a0e9219c6332c8b398caa47e2ec625
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221200.L2_SNPP_SST3.nc 507f3d72b325c6c1f0425915fef364fa29e9b571
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221800.L2_SNPP_SST3.nc 9ce17bc8072ca14092bb16e76b4e4c975338b5a9
#
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V201500122*OC.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g hourly -x "/tmp/txt_file_list"
#
# Results in 8 files:
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221200.L2_SNPP_OC.nc 218fbc57af77b7636470106f23873289d444bb76
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221800.L2_SNPP_OC.nc d518d02fa62eeff61125e5752afd15d5142814f2
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001222400.L2_SNPP_OC.nc 1b06124c201a3edc6f93d832f51584d77cf68df6
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223000.L2_SNPP_OC.nc b9bc8a679bfb22b5c58bac18aa97033795c38073
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223600.L2_SNPP_OC.nc b46d1758844a6f994fce1350237786ec6d17dc8a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224200.L2_SNPP_OC.nc ab028c70d33319a686d578698b6dee6d0594c8c6
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224800.L2_SNPP_OC.nc 3eeadccd67e50239dd6761096e1a1298c4c3b10a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001225400.L2_SNPP_OC.nc 2bce3922e625da88320ba77029dfb5aa1b69c1c6
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V201500122*.nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g hourly -x "/tmp/txt_file_list"
#
# Results in 22 files (includes OC, SST, SST3):
#
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220000.L2_SNPP_SST3.nc 426795377132c7158fc7149ec9c9f17dcb13dab4
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220000.L2_SNPP_SST.nc 6ca5032e549ffcdf13bf77880a7bd8852dc15ea2
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220600.L2_SNPP_SST3.nc 5c1520e571a0e9219c6332c8b398caa47e2ec625
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001220600.L2_SNPP_SST.nc 03af00d41cbe88b26a2b0e25fca2e5de3ceb5304
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221200.L2_SNPP_OC.nc 218fbc57af77b7636470106f23873289d444bb76
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221200.L2_SNPP_SST3.nc 507f3d72b325c6c1f0425915fef364fa29e9b571
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221200.L2_SNPP_SST.nc aa9ae154a597ec6491fcf55507a8d6d39305d2a9
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221800.L2_SNPP_OC.nc d518d02fa62eeff61125e5752afd15d5142814f2
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221800.L2_SNPP_SST3.nc 9ce17bc8072ca14092bb16e76b4e4c975338b5a9
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001221800.L2_SNPP_SST.nc ca629e1f570cb1b87b8f5c8ac2972cb96678a844
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001222400.L2_SNPP_OC.nc 1b06124c201a3edc6f93d832f51584d77cf68df6
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001222400.L2_SNPP_SST.nc 22006616dc70ca52640e2b66481fb62bc027b08a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223000.L2_SNPP_OC.nc b9bc8a679bfb22b5c58bac18aa97033795c38073
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223000.L2_SNPP_SST.nc 6f55cd55f8dae702a988b4260ba37ba5a1828c14
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223600.L2_SNPP_OC.nc b46d1758844a6f994fce1350237786ec6d17dc8a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001223600.L2_SNPP_SST.nc 5fe0bbdd42f9cbbdf7b703554994083876394294
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224200.L2_SNPP_OC.nc ab028c70d33319a686d578698b6dee6d0594c8c6
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224200.L2_SNPP_SST.nc c5b755a607c72ccec0982860bdf44b59ceebfe9d
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224800.L2_SNPP_OC.nc 3eeadccd67e50239dd6761096e1a1298c4c3b10a
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001224800.L2_SNPP_SST.nc 939bfb62f2aad9853e5ed98be6f8554738466744
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001225400.L2_SNPP_OC.nc 2bce3922e625da88320ba77029dfb5aa1b69c1c6
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2015001225400.L2_SNPP_SST.nc 4302c478b95fa0c3219f062d2130b4fa2f6fad8c
#
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V20150012*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g hourly -x "/tmp/txt_file_list"
#
# Results in 4 hourly files:
#
#INFO: Created file(s):
#
# ./viirs_filelist.txt.hourly_2015_001_20_date_2015_01_01 22
# ./viirs_filelist.txt.hourly_2015_001_21_date_2015_01_01 21
# ./viirs_filelist.txt.hourly_2015_001_22_date_2015_01_01 22
# ./viirs_filelist.txt.hourly_2015_001_23_date_2015_01_01 22
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2013-01-01" -e "2013-01-14" -g daily -x "/tmp/txt_file_list"
#
# Results in 14 daily files:
#
#INFO: Created file(s):
#
# ./viirs_filelist.txt.daily_2013_001_date_2013_01_01 524
# ./viirs_filelist.txt.daily_2013_002_date_2013_01_02 523
# ./viirs_filelist.txt.daily_2013_003_date_2013_01_03 524
# ./viirs_filelist.txt.daily_2013_004_date_2013_01_04 524
# ./viirs_filelist.txt.daily_2013_005_date_2013_01_05 523
# ./viirs_filelist.txt.daily_2013_006_date_2013_01_06 525
# ./viirs_filelist.txt.daily_2013_007_date_2013_01_07 523
# ./viirs_filelist.txt.daily_2013_008_date_2013_01_08 524
# ./viirs_filelist.txt.daily_2013_009_date_2013_01_09 524
# ./viirs_filelist.txt.daily_2013_010_date_2013_01_10 523
# ./viirs_filelist.txt.daily_2013_011_date_2013_01_11 524
# ./viirs_filelist.txt.daily_2013_012_date_2013_01_12 524
# ./viirs_filelist.txt.daily_2013_013_date_2013_01_13 523
# ./viirs_filelist.txt.daily_2013_014_date_2013_01_14 524
#
#INFO: all_names_found_in_execution 7332 in_files 14
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-01" -g weekly -x "/tmp/txt_file_list"
#
# Results in 1 weekly file(s):
#
# ./viirs_filelist.txt.weekly_2015_01_date_2015_01_01 525
#
#INFO: all_names_found_in_execution 525 in_files 1
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-02" -g weekly -x "/tmp/txt_file_list"
#
# Results in 1 weekly file(s):
#
# ./viirs_filelist.txt.weekly_2015_01_date_2015_01_01 1052
#
#INFO: all_names_found_in_execution 1052 in_files 1
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-01-03" -g weekly -x "/tmp/txt_file_list"
#
#INFO: Created file(s):
#
# ./viirs_filelist.txt.weekly_2015_01_date_2015_01_01 1577
#
#INFO: all_names_found_in_execution 1577 in_files 1
#
# % python create_generic_download_list.py -l L2 -t "V*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2015-01-01" -e "2015-03-01" -g weekly -x "/tmp/txt_file_list"
#
# Results in 9 weekly files:
#
#INFO: Created file(s):
#
#
# ./viirs_filelist.txt.weekly_2015_01_date_2015_01_01 3682
# ./viirs_filelist.txt.weekly_2015_02_date_2015_01_08 3673
# ./viirs_filelist.txt.weekly_2015_03_date_2015_01_15 3668
# ./viirs_filelist.txt.weekly_2015_04_date_2015_01_22 3667
# ./viirs_filelist.txt.weekly_2015_05_date_2015_01_29 3659
# ./viirs_filelist.txt.weekly_2015_06_date_2015_02_05 3665
# ./viirs_filelist.txt.weekly_2015_07_date_2015_02_12 3661
# ./viirs_filelist.txt.weekly_2015_08_date_2015_02_19 3662
# ./viirs_filelist.txt.weekly_2015_09_date_2015_02_26 2087
#
#INFO: all_names_found_in_execution 31424 in_files 9
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V2016*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2016-01-01" -e "2016-03-01" -g monthly -x "/tmp/txt_file_list"
#
# Results in 3 monthly files (note that this command was executed on 3/1/2016 around 4:52 pm, so more files will be added before midnight):
#
#INFO: Created file(s):
#
# ./viirs_filelist.txt.monthly_2016_01 16677
# ./viirs_filelist.txt.monthly_2016_02 15753
# ./viirs_filelist.txt.monthly_2016_03 541
#
# The following execution:
#
# % python create_generic_download_list.py -l L2 -t "V2016*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc" -n viirs -d 0 -f 1 -a 1 -c 1 -s "2016-01-01" -e "2016-12-31" -g yearly -x "/tmp/txt_file_list"
#
#
#INFO: START_CRAWL: crawl_start_time 1471456394.36 2016-08-17 10:53:14.357717
#create_generic_download_list: Executing query_string http://oceandata.sci.gsfc.nasa.gov/search/file_search.cgi?dtype=L2&search=V2016*[L2_SNPP_OC,L2_SNPP_SST,L2_SNPP_SST3].nc&sensor=viirs&std_only=0&results_as_file=1&addurl=1&cksum=1&sdate=2016-01-01&edate=2016-12-31
#INFO: END_CRAWL: crawl_stop_time 1471456502.09 2016-08-17 10:55:02.090831
#INFO: END_CRAWL: duration_in_seconds 107.73
#INFO: START_SORT: sorting your list of 153899 names into alpha-numeric...
#INFO: END_SORT: duration_in_seconds 0.21
#
#INFO: Created file(s):
#
# ./viirs_filelist.txt.yearly_2016 121252
#
#INFO: all_names_found_in_execution 121252 in_files 1
#
#
#
# The flags, parameters and their meanings:
#
# -l search_dtype = Level 2, only.
# -t search_filter = Regular expression of file name.
# -n search_sensor = viirs, only.
# -d search_std_only = Boolean, avoid non-standard files.
# -f search_as_file = Boolean, get the result as a file.
# -a search_addurl = Boolean, 1 will prepend "http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/" to the file name.
# -c checksum_flag = Boolean, 1 will result in the checksum of the file prepended to the file name plus a space. The file name will not have the cgi/getfile prepended.
# -s search_sdate = The start date of the file in yyyy-mm-dd format.
# -e search_edate = The end date of the file in yyyy-mm-dd format.
# -g search_groupby = One of {hourly,daily,weekly,monthly,yearly}.
# -b search_current_only = Only get files that are "current", meaning files from the last 24 hours.
# -i state_filename = Name of the state file to use.
# -z days_back = How many days back to search for processing start. Default is 1.
#
# Debug flag. Set to 1 if you want to see debug statements.
#
# Some notes:
#
# 1. Because the server may return many names, we have to limit the results with a regular expression. Set the following environment variable to your specific regular expression.
# Example for MODIS Level 3:
#
# setenv CRAWLER_SEARCH_FILE_PATTERN "L3m|L3b"
#
# Example for VIIRS Level 2:
#
# setenv CRAWLER_SEARCH_FILE_PATTERN "L2_SNPP_SST|L2_SNPP_SST3|L2_SNPP_OC"
#
# 2. Although the -s search_sdate and -e search_edate are optional, it is recommended that they be used, since this speeds up the search; otherwise the search can take a long time.
#
# 3. To turn on the debugger, set CRAWLER_SEARCH_DEBUG_FLAG environment variable to true:
#
# C-shell method:
#
# setenv CRAWLER_SEARCH_DEBUG_FLAG true
#
# Bash method:
#
# export CRAWLER_SEARCH_DEBUG_FLAG=true
#
def main(argv):
global g_debug_flag; # Make variable global.
global g_trace_flag; # Make variable global.
g_debug_flag = 1 # Change to 1 if you want to see debug prints. # NET edit.
g_trace_flag = 1 # Change to 1 if you want to see trace prints. Typically used by a developer to see more of what happens under the hood. # NET edit.
g_module_name = 'create_generic_download_list:'
os.environ["CRAWLER_SEARCH_DEBUG_FLAG"] = "false" # NET edit.
os.environ["CRAWLER_SEARCH_TRACE_FLAG"] = "false" # NET edit.
if (os.getenv("CRAWLER_SEARCH_DEBUG_FLAG") == "true"):
g_debug_flag = 1
if (os.getenv("CRAWLER_SEARCH_TRACE_FLAG") == "true"):
g_trace_flag = 1
# Define some tokens to search and replace to allow the correct ordering of the SST.nc granule name relative to SST3.nc.
# In the natural ordering, SST3.nc comes after SST.nc, but we want SST.nc to be last. Replacing SST.nc with SST9999.nc allows it to sort
# after SST3.nc, which is the ordering we want. The code replaces SST9999.nc back to SST.nc after the sort routine.
SST_TOKEN_UNTWEAKED_IN_REPLACE_LOGIC = "L2_SNPP_SST.nc";
SST_TOKEN_TWEAKED_IN_REPLACE_LOGIC = "L2_SNPP_SST9999.nc";
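# As an aside, a minimal sketch of an equivalent way to get that ordering with
# a sort key, instead of tweaking and untweaking the names (illustrative only;
# this script uses the replace/sort/replace approach below):
#
#   names = sorted(names, key=lambda n: n.replace(SST_TOKEN_UNTWEAKED_IN_REPLACE_LOGIC, SST_TOKEN_TWEAKED_IN_REPLACE_LOGIC))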
getfile_uri = "https://oceandata.sci.gsfc.nasa.gov/cgi/getfile"
search_uri = "https://oceandata.sci.gsfc.nasa.gov/api/file_search"
search_dtype = "" # L2
search_filter = "" # "V2015001235*SST.nc" (must be inside double quotes)
search_sensor = "" # viirs (This script supports the viirs sensor only)
search_std_only = "0" # Boolean, avoid non-standard files
search_as_file = "1" # Boolean, get the result as a file.
search_addurl = "1" # Boolean, 1 will prepend "http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/" to the file name.
checksum_flag = "1" # Boolean, 1 will result in the checksum of the file prepended to the file name plus a space. This script will do some post-processing to produce the correct output.
search_sdate = "" # "2015-01-01" (must be inside double quotes)
search_edate = "" # "2015-01-01" (must be inside double quotes)
search_groupby = "" # {hourly,daily,weekly,monthly,yearly}
search_current_only_value = "";
state_filename = ""; # Name of state file to use. Defaults to empty string unless overridden with -i <state_filename>
search_psdate = "" # "2015-01-01" (must be inside double quotes: file processing start date for search)
search_pedate = "" # "2015-01-01" (must be inside double quotes: file processing end date for search)
search_days_back = 1; # How many days back to search for file processing date start. Default is 1 to get files added in the last 24 hours, roughly.
txt_file_list = ""; # Path for the -x option; initialized here so a missing -x does not raise a NameError later.
# Define a pattern of names that this code will look for, ignoring anything else.
# Add any new patterns here, separated by the '|' character.
# If the user wishes to only look for _SST and _SST3, the following can be set before running this Python script:
#
# setenv CRAWLER_SEARCH_FILE_PATTERN "_SST|_SST3"
default_pattern = "_SST|_SST4|_OC"; # This is the default pattern: look for SST, SST4 and OC files. _OC files may not be needed.
if (os.getenv('CRAWLER_SEARCH_FILE_PATTERN','') != ""):
default_pattern = os.getenv('CRAWLER_SEARCH_FILE_PATTERN','');
if (g_debug_flag):
print(g_module_name + "default_pattern[" + default_pattern + "]");
pattern_to_look_for = re.compile(default_pattern);
# Get the parameters from command line.
try:
opts, args = getopt.getopt(argv,"hl:t:n:d:f:a:c:s:e:o:g:b:i:z:x:")
except getopt.GetoptError:
print('python create_generic_download_list.py -l <dtype> -t <filter> -n <sensor> -d <std_only> -f <as_file> -a <addurl> -c <cksum> -s <sdate> -e <edate> -g <search_groupby> -b <search_current_only_value> -i <state_filename> -z <search_days_back> -x <path_to_text_file>')
print('Example:')
print('python create_generic_download_list.py -l l3m -t "A2016241*.nc" -n modis -d 0 -f 1 -a 1 -c 1 -s "2016-08-28" -e "2016-08-28" -g daily -x "/tmp/txt_file_list"')
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print('python create_generic_download_list.py -l <dtype> -t <filter> -n <sensor> -d <std_only> -f <as_file> -a <addurl> -c <cksum> -s <sdate> -e <edate> -g <search_groupby> -b <search_current_only_value> -i <state_filename> -z <search_days_back> -x <path_to_text_file>')
sys.exit()
elif opt in ("-l"):
search_dtype = arg
elif opt in ("-t"):
search_filter = arg
elif opt in ("-n"):
search_sensor = arg
elif opt in ("-d"):
search_std_only = arg
elif opt in ("-f"):
search_as_file = arg
elif opt in ("-a"):
search_addurl = arg
elif opt in ("-c"):
checksum_flag = arg
elif opt in ("-s"):
search_sdate = arg
elif opt in ("-e"):
search_edate = arg
elif opt in ("-g"):
search_groupby = arg # {hourly,daily,weekly,monthly,yearly}
elif opt in ("-b"):
search_current_only_value = arg # {crawl_current}
elif opt in ("-i"):
state_filename = arg # Name of state file to use. This allows the forward-stream crawl to ignore files it has already seen.
print(state_filename)
elif opt in ("-z"):
search_days_back = int(arg) # How many days back to search for file processing date start. Default is 1
elif opt in ("-x"):
txt_file_list = arg
encountered_error_flag = False;
# For a yearly search, we provide the ability to split the query up by month so as not to time out if that particular dataset is too big.
if (search_groupby == "yearly"):
# Split the search start date and search end date parameters into individual months.
(o_search_sdates,o_search_edates) = generic_split_search_dates_into_months(search_sdate,search_edate);
print("search_sdate",search_sdate);
print("search_edate",search_edate);
print("o_search_sdates",o_search_sdates);
print("o_search_edates",o_search_edates);
print("");
processing_loop = 0;
max_loop = len(o_search_sdates);
while ((not encountered_error_flag) and (processing_loop < max_loop)):
search_sdate = o_search_sdates[processing_loop];
search_edate = o_search_edates[processing_loop];
if (os.getenv("CRAWLER_SEARCH_SKIP_ACTUAL_DOWNLOAD","") == "true"):
print("CRAWLER_SEARCH_SKIP_ACTUAL_DOWNLOAD is true. No downloading.")
print(" processing_loop",processing_loop,"max_loop",processing_loop,"search_sdate",search_sdate,"search_edate",search_edate);
else:
try:
encountered_error_flag = create_generic_download_list(search_dtype, # L2
search_filter, # "V2015001235*SST.nc" (must be inside double quotes)
search_sensor, # viirs (This script supports the viirs sensor only)
search_std_only, # Boolean, avoid non-standard files
search_as_file, # Boolean, get the result as a file.
search_addurl, # Boolean, 1 will prepend "http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/" to the file name.
checksum_flag, # Boolean, 1 will result in the checksum of the file prepended to the file name plus a space. This script will do some post-processing to produce the correct output.
search_sdate, # "2015-01-01" (must be inside double quotes)
search_edate, # "2015-01-01" (must be inside double quotes)
search_groupby, # {hourly,daily,weekly,monthly,yearly}
search_current_only_value,
state_filename, # Name of state file to use. Defaults to empty string unless overridden with -i <state_filename>
search_psdate, # "2015-01-01" (must be inside double quotes: file processing start date for search)
search_pedate, # "2015-01-01" (must be inside double quotes: file processing end date for search)
search_days_back, # How many days back to search for file processing date start. Default is 1 to get files added in the last 24 hours, roughly.
pattern_to_look_for,
txt_file_list);
except Exception as e:
write_out_error_file(str(e))
print(f"{g_module_name} - INFO: Exiting with exit code 1.")
sys.exit(1)
# end else portion of if (os.getenv("CRAWLER_SEARCH_SKIP_ACTUAL_DOWNLOAD","") == "true")
processing_loop += 1;
# end while ((not encountered_error_flag) and (processing_loop < max_loop))
# # For now, exit after 2 iterations.
# if (processing_loop >= 1):
# encountered_error_flag = True;
# end while not encountered_error_flag
else:
try:
encountered_error_flag = create_generic_download_list(search_dtype, # L2
search_filter, # "V2015001235*SST.nc" (must be inside double quotes)
search_sensor, # viirs (This script supports the viirs sensor only)
search_std_only, # Boolean, avoid non-standard files
search_as_file, # Boolean, get the result as a file.
search_addurl, # Boolean, 1 will prepend "http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/" to the file name.
checksum_flag, # Boolean, 1 will result in the checksum of the file prepended to the file name plus a space. This script will do some post-processing to produce the correct output.
search_sdate, # "2015-01-01" (must be inside double quotes)
search_edate, # "2015-01-01" (must be inside double quotes)
search_groupby, # {hourly,daily,weekly,monthly,yearly}
search_current_only_value,
state_filename, # Name of state file to use. Defaults to empty string unless overridden with -i <state_filename>
search_psdate, # "2015-01-01" (must be inside double quotes: file processing start date for search)
search_pedate, # "2015-01-01" (must be inside double quotes: file processing end date for search)
search_days_back, # How many days back to search for file processing date start. Default is 1 to get files added in the last 24 hours, roughly.
pattern_to_look_for,
txt_file_list);
except Exception as e:
write_out_error_file(str(e))
print(f"{g_module_name} - INFO: Exiting with exit code 1.")
sys.exit(1)
# Depending on whether we encountered an error, we exit with the appropriate code so an external program can decide what to do.
if (encountered_error_flag):
write_out_error_file("Error encountered in create_generic_download_list function.")
print(f"{g_module_name} - INFO: Exiting with exit code 1.")
sys.exit(1)
else:
sys.exit(0)
def create_generic_download_list(search_dtype, # L2
search_filter, # "V2015001235*SST.nc" (must be inside double quotes)
search_sensor, # viirs (This script supports the viirs sensor only)
search_std_only, # Boolean, avoid non-standard files
search_as_file, # Boolean, get the result as a file.
search_addurl, # Boolean, 1 will prepend "http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/" to the file name.
checksum_flag, # Boolean, 1 will result in the checksum of the file prepended to the file name plus a space. This script will do some post-processing to produce the correct output.
search_sdate, # "2015-01-01" (must be inside double quotes)
search_edate, # "2015-01-01" (must be inside double quotes)
search_groupby, # {hourly,daily,weekly,monthly,yearly}
search_current_only_value,
state_filename, # Name of state file to use. Defaults to empty string unless overridden with -i <state_filename>
search_psdate, # "2015-01-01" (must be inside double quotes: file processing start date for search)
search_pedate, # "2015-01-01" (must be inside double quotes: file processing end date for search)
search_days_back, # How many days back to search for file processing date start. Default is 1 to get files added in the last 24 hours, roughly.
pattern_to_look_for,
txt_file_list):
global g_debug_flag; # Make variable global.
global g_trace_flag; # Make variable global.
g_debug_flag = 0 # Change to 1 if you want to see debug prints.
g_trace_flag = 0 # Change to 1 if you want to see trace prints. Typically used by a developer to see more of what happens under the hood.
g_module_name = 'create_generic_download_list:'
if (os.getenv("CRAWLER_SEARCH_DEBUG_FLAG") == "true"):
g_debug_flag = 1
if (os.getenv("CRAWLER_SEARCH_TRACE_FLAG") == "true"):
g_trace_flag = 1
o_encountered_error_flag = False;
# Define some tokens to search and replace to allow the correct ordering of the SST.nc granule name relative to SST3.nc.
# In the natural ordering, SST3.nc comes after SST.nc, but we want SST.nc to be last. Replacing SST.nc with SST9999.nc allows it to sort
# after SST3.nc, which is the ordering we want. The code replaces SST9999.nc back to SST.nc after the sort routine.
SST_TOKEN_UNTWEAKED_IN_REPLACE_LOGIC = "L2_SNPP_SST.nc";
SST_TOKEN_TWEAKED_IN_REPLACE_LOGIC = "L2_SNPP_SST9999.nc";
getfile_uri = "https://oceandata.sci.gsfc.nasa.gov/cgi/getfile"
search_uri = "https://oceandata.sci.gsfc.nasa.gov/api/file_search"
# Special processing if the value of search_current_only_value is "crawl_current".
g_state_dictionary = {}; # Define a dictionary so we can save all files' state to this dictionary. A state of a file is a name plus checksum -> http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/V2016001000000.L2_SNPP_OC.nc 466cbc5de5c8286a9723eb3c935c20aa98eabbc0
g_state_for_saving_dictionary = {}; # Define a dictionary so we can save the current state. This allows us to have a state of only recent files. It doesn't carry over.
g_use_state_flag = True; # NET edit.
g_num_names_added_to_state = 0;
g_num_names_replaced_in_state = 0;
g_num_existing_names_same_checksum_in_state = 0;
g_num_existing_names_different_checksum_in_state = 0;
g_num_names_found_from_crawling = 0;# A raw number of names found from crawling.
g_num_names_matching_pattern = 0; # Number of file names matching pattern_to_look_for object.
g_num_names_dissimilar_pattern = 0; # Number of file names different than pattern_to_look_for object.
g_num_file_states_loaded = 0;
default_state_filename = "";
default_state_filename = state_filename # NET edit.
# The crawl_current mode has special processing:
# 1. This crawling allows the crawler to pick up files that have been processed within a window of time.
# 2. If new files have been added since the last crawl, only the new file names will be returned.
# 3. It uses a state file to keep track of file names it has seen before.
if (search_current_only_value == "crawl_current"):
g_use_state_flag = True;
if (g_debug_flag):
print(g_module_name + "search_days_back",search_days_back)
window_start_date = datetime.datetime.now() - datetime.timedelta(days=search_days_back); # The window start is that many days back.
time_search_option = "use_date_when_file_processed"; # TIME_SEARCH_OPTION_1 Use the processing date of the file to search.
# time_search_option = "use_date_in_filename"; # TIME_SEARCH_OPTION_2 Use the date in the file name to search.
# TIME_SEARCH_OPTION_1: To make sure we get files that have been processed within a window, we only search for files that have been processed from the window start to now.
# By default, the window start is one day ago, unless the -z option is specified, in which case we get files that have been processed that many days back.
if (time_search_option == "use_date_when_file_processed"):
from datetime import date;
search_psdate = window_start_date.strftime("%Y-%m-%d");
#search_pedate = time.strftime("%Y-%m-%d");
# For some strange reason, using today's date does not return any files.
# Add a couple of days to today's date so the end date falls after today.
weird_end_date = date.today() + datetime.timedelta(days=2);
search_edate = weird_end_date.strftime("%Y-%m-%d");
print(g_module_name + "search_psdate["+search_psdate+"]")
print(g_module_name + "search_edate["+search_edate+"]")
print(g_module_name + "time_search_option["+time_search_option+"]")
#exit(0);
# TIME_SEARCH_OPTION_2: To make sure we get files whose times start from the specified start date, we only search for files from the specified start date to midnight today.
if (time_search_option == "use_date_in_filename"):
# If the user has not specified the -s <sdate>, we get it from the window start.
if (search_sdate == ""):
search_sdate = window_start_date.strftime("%Y-%m-%d");
# If the user has not specified the -e <edate>, we get it from today's date.
if (search_edate == ""):
search_edate = time.strftime("%Y-%m-%d");
default_state_filename = "viirs_daily_current_state.txt";
if (state_filename != ""):
default_state_filename = state_filename;
if (g_debug_flag):
print(g_module_name + "search_current_only_value",search_current_only_value);
print(g_module_name + "search_sdate",search_sdate);
print(g_module_name + "search_edate",search_edate);
print(g_module_name + "search_psdate",search_psdate);
print(g_module_name + "search_pedate",search_pedate);
print(g_module_name + "time_search_option",time_search_option);
print(g_module_name + "state_filename",state_filename);
print(g_module_name + "default_state_filename",default_state_filename);
# sys.exit(0);
# Load the content of the default state file into a dictionary.
if os.path.isfile(default_state_filename):
if (g_debug_flag):
print(g_module_name + "loading default_state_filename",default_state_filename)
# g_debug_flag = 1;
with open(default_state_filename) as input_file_fp:
for line in input_file_fp:
g_num_file_states_loaded += 1;
(key, val) = line.rstrip().split();
g_state_dictionary[key] = val; # Read from state file into g_state_dictionary so we can have something to check for a file when we see it.
if (g_num_file_states_loaded != len(g_state_dictionary)):
print("ERROR:default_state_filename",default_state_filename,"g_num_file_states_loaded",g_num_file_states_loaded,"len(g_state_dictionary)",len(g_state_dictionary),"two_values_differ");
input_file_fp.close();
write_out_error_file(f"default_state_filename {default_state_filename} g_num_file_states_loaded {g_num_file_states_loaded} {len(g_state_dictionary)} {len(g_state_dictionary)} two_values_differ")
sys.exit(1);
if (g_trace_flag):
print("line",line.rstrip());
print("key",key,"val",val)
input_file_fp.close();
if (g_debug_flag):
print(g_module_name + "loaded default_state_filename",default_state_filename,"g_num_file_states_loaded",g_num_file_states_loaded,"len(g_state_dictionary)",len(g_state_dictionary));
else:
print(g_module_name + "WARN:FILE_NOT_EXIST:default_state_filename",default_state_filename,"does not exist yet. Starting with zero state.");
# sys.exit(0);
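# For reference, a compact sketch of the same state-file load (illustrative
# only; the loop above is what actually runs, including its consistency check
# on the number of loaded states):
#
#   with open(default_state_filename) as fp:
#       g_state_dictionary = dict(line.rstrip().split() for line in fp if line.strip())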
if (g_debug_flag):
print(g_module_name + "search_sdate",search_sdate);
print(g_module_name + "search_edate",search_edate);
# exit(0);
# Do a sanity check on all parameters.
if (search_dtype == ""):
print('ERROR: Must specify option: -l <dtype>')
print('python create_generic_download_list.py -l <dtype> -t <filter> -n <sensor> -d <std_only> -f <as_file> -a <addurl> -c <cksum> -s <sdate> -e <edate> -g <search_groupby>')
print('Example: -l L2')
sys.exit(2)
if (search_filter == ""):
print('ERROR: Must specify option: -t <filter>')
print('python create_generic_download_list.py -l <dtype> -t <filter> -n <sensor> -d <std_only> -f <as_file> -a <addurl> -c <cksum> -s <sdate> -e <edate> -g <search_groupby>')
print('Example: -t "T2015001235*SST.nc"')
sys.exit(2)
if (search_sensor == ""):
print('ERROR: Must specify option: -n <sensor>')
print('python create_generic_download_list.py -l <dtype> -t <filter> -n <sensor> -d <std_only> -f <as_file> -a <addurl> -c <cksum> -s <sdate> -e <edate> -g <search_groupby>')
print('Example: -n viirs')
sys.exit(2)
valid_groupby_list = ["hourly","daily","weekly","monthly","yearly"];
if (search_groupby not in valid_groupby_list):
print('ERROR: Must specify a valid value for -g option:')
print('Valid options are',valid_groupby_list);
print('You specified',search_groupby);
sys.exit(2)
# Do a sanity check on the search_filter to see that it at least contains the year.
# For some strange reason, the search query behaves badly when you don't give it a year, i.e.
# it will not give you any results and you will be left wondering why it won't work.
# validate_search_filter(search_filter);
# Check to see if search_sdate and search_edate are provided and add them to the query_string.
query_string = ""
# Because we added the ability to search for files that have been processed within a window, the logic gets a little tricky
# as you have to keep track of these four parameters while building the query_string variable:
#
# the start window of file granule start time
# the end window of file granule start time
# the start window of a file processing time
# the end window of a file processing time
if (search_sdate != ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_SDATE_NOT_EMPTY");
if (search_edate != ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_EDATE_NOT_EMPTY");
if (search_psdate != ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_PSDATE_NOT_EMPTY");
if (search_pedate != ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_PSDATE_NOT_EMPTY:SEARCH_PEDATE_NOT_EMPTY");
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag + "&sdate=" + search_sdate + "&edate=" + search_edate + "&psdate=" + search_psdate + "&pedate" + search_pedate
else: # search_pedate is empty string.
if (g_debug_flag):
print(g_module_name + "SEARCH_PSDATE_NOT_EMPTY:SEARCH_PEDATE NOT EMPTY");
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag + "&sdate=" + search_sdate + "&edate=" + search_edate + "&psdate=" + search_psdate;
else: # search_psdate is empty string.
if (g_debug_flag):
print(g_module_name + "SEARCH_EDATE_NOT_EMPTY:SEARCH_PSDATE_EMPTY");
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag + "&sdate=" + search_sdate + "&edate=" + search_edate;
else: # search_edate is empty string
if (g_debug_flag):
print(g_module_name + "SEARCH_SDATE_NOT_EMPTY:SEARCH_EDATE_EMPTY");
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag + "&sdate=" + search_sdate;
else: # search_sdate is empty string.
# It is acceptable for search_sdate to be empty (meaning the user does not care when the granule started), but they should then specify the search_filter to something.
if (g_debug_flag):
print(g_module_name + "SEARCH_SDATE_EMPTY");
if (search_psdate != ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_PSDATE_NOT_EMPTY");
if (search_pedate != ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_SDATE_EMPTY:SEARCH_PSDATE_NOT_EMPTY:SEARCH_PEDATE NOT EMPTY");
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag + "&psdate=" + search_psdate + "&pedate=" + search_pedate;
else:
if (g_debug_flag):
print(g_module_name + "SEARCH_SDATE_EMPTY:SEARCH_PSDATE_NOT_EMPTY:SEARCH_PEDATE_EMPTY");
#print(g_module_name + "search_uri",search_uri,type(search_uri))
#print(g_module_name + "search_dtype",search_dtype,type(search_uri))
#print(g_module_name + "search_filter",search_filter,type(search_filter))
#print(g_module_name + "search_sensor",search_sensor,type(search_sensor))
#print(g_module_name + "search_std_only",search_std_only,type(search_std_only))
#print(g_module_name + "search_as_file",search_as_file,type(search_as_file))
#print(g_module_name + "search_addurl",search_addurl,type(search_addurl))
#print(g_module_name + "checksum_flag",checksum_flag,type(checksum_flag))
#print(g_module_name + "search_psdate",search_psdate,type(search_psdate))
#print("search_uri='"+str(search_uri)+"'")
#print("search_dtype='"+str(search_dtype)+"'")
#print("search_filter='"+str(search_filter)+"'")
#print("search_sensor='"+str(search_sensor)+"'")
#print("search_std_only='"+str(search_std_only)+"'")
#print("search_as_file='"+str(search_as_file)+"'")
#print("search_addurl='"+str(search_addurl)+"'")
#print("checksum_flag='"+str(checksum_flag)+"'")
#print("search_psdate='"+str(search_psdate)+"'")
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag + "&psdate=" + search_psdate;
else: # if (search_psdate == ""):
if (g_debug_flag):
print(g_module_name + "SEARCH_SDATE_EMPTY:SEARCH_PSDATE_EMPTY");
query_string = search_uri + "?" + "dtype=" + search_dtype + "&search=" + search_filter + "&sensor=" + search_sensor + "&std_only=" + search_std_only + "&results_as_file=" + search_as_file + "&addurl=" + search_addurl + "&cksum=" + checksum_flag;
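# For reference, a minimal sketch of an equivalent way to build query_string
# with urllib.parse.urlencode, dropping the empty date parameters (illustrative
# only; note that urlencode percent-encodes characters such as '*' in the
# filter, which the server is expected to decode):
#
#   from urllib.parse import urlencode
#
#   params = {"dtype": search_dtype, "search": search_filter, "sensor": search_sensor,
#             "std_only": search_std_only, "results_as_file": search_as_file,
#             "addurl": search_addurl, "cksum": checksum_flag,
#             "sdate": search_sdate, "edate": search_edate,
#             "psdate": search_psdate, "pedate": search_pedate}
#   query_string = search_uri + "?" + urlencode({k: v for k, v in params.items() if v != ""})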
if (g_debug_flag):
print(g_module_name + "query_string [" + query_string + "]")
# Get the default output directory if provided, otherwise, use the current directory.
CRAWLER_DEFAULT_OUTPUT_DIRECTORY = "./";
if (os.getenv("CRAWLER_SEARCH_DEFAULT_OUTPUT_DIRECTORY","") != ""):
if (os.getenv("CRAWLER_SEARCH_DEFAULT_OUTPUT_DIRECTORY","").endswith("/")):
CRAWLER_DEFAULT_OUTPUT_DIRECTORY = os.getenv("CRAWLER_SEARCH_DEFAULT_OUTPUT_DIRECTORY");
else:
CRAWLER_DEFAULT_OUTPUT_DIRECTORY = os.getenv("CRAWLER_SEARCH_DEFAULT_OUTPUT_DIRECTORY") + "/";
# Check if directory exists. If not, create it.
if (not os.path.isdir(CRAWLER_DEFAULT_OUTPUT_DIRECTORY)):
print(g_module_name, 'DIR_CREATE',CRAWLER_DEFAULT_OUTPUT_DIRECTORY);
os.mkdir(CRAWLER_DEFAULT_OUTPUT_DIRECTORY);
# Fetch the content from the query.
if (g_debug_flag):
print('os.getenv("CRAWLER_SEARCH_TEST_MODE")[',os.getenv("CRAWLER_SEARCH_TEST_MODE",""),']');
crawl_start_time = time.time();
print(g_module_name + 'INFO:START_CRAWL:crawl_start_time',crawl_start_time, str(datetime.datetime.now()));
# g_trace_flag = 1;
if (os.getenv("CRAWLER_SEARCH_TEST_MODE") == "true"):
# For developer only: Since we are running a test, we want to set these values as hard-coded.
# The content is a large string with carriage returns.
print('os.getenv("CRAWLER_SEARCH_TEST_MODE")[',os.getenv("CRAWLER_SEARCH_TEST_MODE",""),']');
content_raw = \
'6aadb70909b95a0ba139b2ece0240bee1e9531a1 V2015001200000.L2_SNPP_SST.nc\n' + \
'd91ab88270cf8dbff745ad0f96da05427fed56fe V2015001200500.L2_SNPP_SST.nc\n' + \
'953ea181edcd8bfdbcf9402b68be46b37054fd69 V2015001201000.L2_SNPP_SST.nc\n' + \
'8064d036c26a6a0cfc67a7eb694148c7fe248d52 V2015001201500.L2_SNPP_SST.nc\n' + \
'1a699df687f625bc6ab0a1e0f990898626581cf0 V2015001202000.L2_SNPP_SST.nc\n' + \
'f5bcbb45e6a25749969d2ba985027f83e34490f1 V2015001202500.L2_SNPP_SST.nc\n' + \
'25e37084e51bb1bac49920eee39550b0b3bb262f V2015001203000.L2_SNPP_SST.nc\n' + \
'a77eb9e2274dc43a4a7d06d9ac7114b74ad40c91 V2015001203500.L2_SNPP_SST.nc\n' + \
'94bcd3d5ee88fdc9f27ec8646bc04201351a08b4 V2015001204000.L2_SNPP_SST.nc\n' + \
'2c8c94f7d37eb36d70e2ada526460e88604c5657 V2015001204000.L2_SNPP_SST3.nc\n' + \
'ffd75043967a6dd9f9400ed84c6d859a8cc04660 V2015001204500.L2_SNPP_SST.nc\n' + \
'8a61a0df1c9af563603b364020731a562ce7a689 V2015001204500.L2_SNPP_SST3.nc\n' + \
'd69d4d93e474ae6d2421998b78116fa305fe2679 V2015001205000.L2_SNPP_SST.nc\n' + \
'988068a80e3dfc8e29772937529414f5c72229ad V2015001205000.L2_SNPP_SST3.nc\n' + \
'ce807d2aa9fffd11f0a6a4feb6301c045564aee9 V2015001205500.L2_SNPP_SST.nc\n' + \
'0674ea50c697834d4dc8a857b334728fa28267b4 V2015001205500.L2_SNPP_SST3.nc\n' + \
'zzz4ea50c697834d4dc8a857b334728fa2826zzz V2015001210000.L2_SNPP_SST.nc\n' + \
'zzz4ea50c697834d4dc8a857b334728fa2826zzz V2016001210000.L2_SNPP_SST.nc\n';
else:
# The content returned from the read() function is a large string with carriage returns.
# Search by creation date if the CREATION_DATE_SEARCH environment variable is set - this prevents the inclusion of files that have been modified.
creation_date = int(os.getenv("CREATION_DATE_SEARCH", "0")) # Default to "0" so a missing variable does not raise a TypeError.
if creation_date: query_string += "&crdate=1"
print(g_module_name + 'INFO:Executing query_string', query_string);
#exit(0);
content_raw = requests.get(query_string)
crawl_stop_time= time.time();
print(g_module_name + 'INFO:END_CRAWL:crawl_stop_time',crawl_stop_time,str(datetime.datetime.now()));
print(g_module_name + 'INFO:END_CRAWL:duration_in_seconds %.2f' % (crawl_stop_time - crawl_start_time));
time.sleep(3) # Sleep for 3 seconds so the user can see how long the query took.
if (g_trace_flag):
print(g_module_name + "TRACE:content_raw [" + content_raw.text + "]") # NET Edit.
print(g_module_name + "query_string [" + query_string + "]")
# sys.exit(0);
content_as_list_unsorted = content_raw.text.split('\n')
# Because OBPG does not have these names in any order we can depend on, we have to sort them.
# Also, the list starts with the checksum, then the name:
#
# 02cd518f8f2e9cfbd2ec4a0fc1cadcda6ae0c318 V2016016120000.L2_SNPP_SST3.nc
# 011d8da0d15f080098d75efc9a9680056ae989b4 V2016010043500.L2_SNPP_SST3.nc
# f282221294648c47049e2478001c582388d83e87 V2016013101000.L2_SNPP_SST.nc
# 9e9064a8c1566319cf6609f008b73a20a2c2d5f5 V2016018022000.L2_SNPP_SST.nc
# e5e4238636d033fefcc09ecc79cb7110015e9e8a V2016018205500.L2_SNPP_SST.nc
#
# We have to swap the columns, sort them using the file name, then write them out.
# Split each line into checksum and file name components, then save them into a dictionary.
new_content_with_names_and_checksum_switched = [];
# Only process if the line contains the regular expression.
regular_expression_to_check = os.getenv("CRAWLER_SEARCH_FILE_PATTERN","");
for one_line in content_as_list_unsorted:
if (g_debug_flag):
print(g_module_name + "regular_expression_to_check",regular_expression_to_check);
print(g_module_name + "one_line[",one_line,"]");
print(g_module_name + "bool(re.search(regular_expression_to_check, one_line))",bool(re.search(regular_expression_to_check, one_line)));
if (regular_expression_to_check == ""):
print(g_module_name + "Nothing to check for from CRAWLER_SEARCH_FILE_PATTERN");
sys.exit(0);
if bool(re.search(regular_expression_to_check, one_line)):
# Parse the line into 2 tokens.
tokens = re.findall(r'[^"\s]\S*|".+?"', one_line)
if (len(tokens) >= 2):
checksum_part = tokens[0];
filename_part = tokens[1];
# In the natural order of the list of files below, SST comes before SST3:
#
# A2013001232000.L2_SNPP_OC.nc
# A2013001232000.L2_SNPP_SST.nc
# A2013001232000.L2_SNPP_SST3.nc
#
# But we want SST to be the last name, so we tweak L2_SNPP_SST.nc to L2_SNPP_SST9999.nc so that it sorts last.
tweaked_filename = filename_part;
if (filename_part.find(SST_TOKEN_UNTWEAKED_IN_REPLACE_LOGIC) >= 0):
tweaked_filename = filename_part.replace(SST_TOKEN_UNTWEAKED_IN_REPLACE_LOGIC,SST_TOKEN_TWEAKED_IN_REPLACE_LOGIC);
new_line = tweaked_filename + " " + checksum_part
if (g_trace_flag):
print(g_module_name + 'Adding:new_content_with_names_and_checksum_switched:',new_line);
new_content_with_names_and_checksum_switched.append(new_line);
# end if (len(tokens) >= 2):
# end bool(re.search(regular_expression_to_check, one_line))
# end for one_line in content_as_list_unsorted:
if (g_trace_flag):
print(g_module_name + 'new_content_with_names_and_checksum_switched',new_content_with_names_and_checksum_switched);
print(g_module_name + 'INFO:START_SORT:sorting your list of',len(new_content_with_names_and_checksum_switched),'names into alpha-numeric...');
sort_start_time = time.time();
content_as_list_sorted_but_with_tweaked_names = sorted(new_content_with_names_and_checksum_switched);
sort_stop_time= time.time();
print(g_module_name + 'INFO:END_SORT:duration_in_seconds %.2f' % (sort_stop_time - sort_start_time));
print(g_module_name + 'content_as_list_sorted_but_with_tweaked_names ');
# sys.exit(0);
# Now that the list has been sorted, we have to replace the tweaked names back to the untweaked names, i.e. replace A2013001231500.L2_SNPP_SST9999.nc with A2013001231500.L2_SNPP_SST.nc
content_as_list_sorted = [];
for one_line in content_as_list_sorted_but_with_tweaked_names:
# Because we wanted SST to be the last name, we tweaked L2_SNPP_SST.nc to L2_SNPP_SST9999.nc; now restore the original name.
original_line = one_line;
if (one_line.find(SST_TOKEN_TWEAKED_IN_REPLACE_LOGIC) >= 0):
original_line = one_line.replace(SST_TOKEN_TWEAKED_IN_REPLACE_LOGIC,SST_TOKEN_UNTWEAKED_IN_REPLACE_LOGIC);
if (g_trace_flag):
print(g_module_name + 'Adding:content_as_list_sorted:',original_line);
content_as_list_sorted.append(original_line);
# sys.exit(0);
# For each line in the list, parse the tokens, swap the first and second column and write the tokens back out to the download list.
found_names = 0;
all_names_found_in_execution = 0;
first_name_found_flag = False;
# We start with a default file name, which is then appended with specific verbiage indicating which type of file it is: yearly, monthly, weekly, daily, hourly, etc.
output_file_pointer = 'DUMMY_OUTPUT_FILE_POINTER';
previous_output_file_name = '';
# The aqua or terra sensor has "modis_" prepended to the file name to keep consistency with the modis-rdac handler.
if (search_sensor == "aqua" or search_sensor == "terra"):
output_file_name = CRAWLER_DEFAULT_OUTPUT_DIRECTORY + "modis_" + search_sensor + '_filelist.txt';
else:
output_file_name = CRAWLER_DEFAULT_OUTPUT_DIRECTORY + search_sensor + '_filelist.txt';
BASE_OUTPUT_FILE_NAME = output_file_name;
# if (search_sensor == 'modis'):
# output_file_name = CRAWLER_DEFAULT_OUTPUT_DIRECTORY + 'viirs_filelist.txt';
# BASE_OUTPUT_FILE_NAME = output_file_name;
# else:
# print "This script only support sensor modis";
# sys.exit(0);
# print "BASE_OUTPUT_FILE_NAME",BASE_OUTPUT_FILE_NAME;
# sys.exit(0);
# Use this list to save the list of output file names so we can write them out to user.
list_of_output_file_names = [];
current_year = 'DUMMY_CURRENT_YEAR';
previous_year = 'DUMMY_PREVIOUS_YEAR';
current_month = 'DUMMY_CURRENT_MONTH';
previous_month = 'DUMMY_PREVIOUS_MONTH';
current_week = 'DUMMY_CURRENT_WEEK';
previous_week = 'DUMMY_PREVIOUS_WEEK';
current_day_of_year = 'DUMMY_CURRENT_DAY_OF_YEAR';
previous_day_of_year = 'DUMMY_PREVIOUS_DAY_OF_YEAR';
current_hour = 'DUMMY_CURRENT_HOUR';
previous_hour = 'DUMMY_PREVIOUS_HOUR';
# As each file is inspected against the state file, we determine whether we have seen this file before.
# If the file has not been seen before, we set the state to "ready_for_saving".
# If the file has been seen before but the checksum has been modified, we set the state to "ready_for_saving".
# If the checksum is the same, we set the state to "seen_before_with_same_checksum".
# Only files with the state "ready_for_saving" will be written to file.
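# A minimal sketch of that decision (illustrative only; state_dict, name and
# checksum stand in for the variables used in the loop below):
#
#   def decide_state(state_dict, name, checksum):
#       if name not in state_dict or state_dict[name] != checksum:
#           return "ready_for_saving"
#       return "seen_before_with_same_checksum"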
file_state_status = "";
g_num_added_to_saving_dictionary = 0;
# Only process if the line contains the regular expression.
regular_expression_to_check = os.getenv("CRAWLER_SEARCH_FILE_PATTERN","");
for one_line in content_as_list_sorted:
file_state_status = "undefined_state"; # Start out with an undefined_state and will be either set to "ready_for_saving" or "seen_before_with_same_checksum";
# Each line looks like this, and we will need to swap the columns and prepend 'http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/' to the file name.