# ######### #
# HUB VARS #
# ######### #
import logging
import os

from biothings.utils.configuration import ConfigurationError
from biothings.utils.loggers import setup_default_log

DATA_HUB_DB_DATABASE = "biothings_hubdb" # db containing the following (internal use)
DATA_SRC_MASTER_COLLECTION = 'src_master' # for metadata of each src collection
DATA_SRC_DUMP_COLLECTION = 'src_dump' # for src data download information
DATA_SRC_BUILD_COLLECTION = 'src_build' # for src data build information
DATA_SRC_BUILD_CONFIG_COLLECTION = 'src_build_config'
DATA_PLUGIN_COLLECTION = 'data_plugin' # for data plugins information
API_COLLECTION = 'api' # for api information (running under hub control)
CMD_COLLECTION = 'cmd' # for cmd launched from the hub
EVENT_COLLECTION = 'event' # for event propagation
DATA_TARGET_MASTER_COLLECTION = 'db_master'
# Redis config to cache IDs when doing cold/hot merge
REDIS_CONNECTION_PARAMS = {}
# where to store info about processes launched by the hub
RUN_DIR = '/tmp/run'
# when reporting diff results, number of IDs to consider (to avoid too much mem usage)
MAX_REPORTED_IDS = 1000
# for diff updates, number of IDs randomly picked as examples when rendering the report
MAX_RANDOMLY_PICKED = 10
# size in bytes for a diff file (used in diff/reduce step)
MAX_DIFF_SIZE = 10 * 1024**2
# ES S3 repository to use for snapshot/restore (must be pre-configured in ES)
SNAPSHOT_REPOSITORY = "pending_repository"
# cache file format ("": ascii/text uncompressed, or "gz|zip|xz")
CACHE_FORMAT = "xz"
# How much memory the hub is allowed to use:
# - "auto": let the hub decide (will use 50%-60% of available RAM)
# - None: no limit
# - otherwise specify a number in bytes
HUB_MAX_MEM_USAGE = None
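# For illustration only (not part of the original config), an explicit cap in bytes
# would look like:
# HUB_MAX_MEM_USAGE = 8 * 1024**3  # hypothetical 8GiB limit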
# Max number of *processes* the hub can use to run jobs
HUB_MAX_WORKERS = max(1, int(os.cpu_count() / 4))
# Max number of *threads* the hub can use (defaults to HUB_MAX_WORKERS if undefined)
HUB_MAX_THREADS = HUB_MAX_WORKERS
MAX_SYNC_WORKERS = HUB_MAX_WORKERS
# Max number of queued jobs in the job manager.
# This shouldn't be 0, so that a job is always pending and ready to be processed
# at any time (avoiding job submission preparation overhead), but it also shouldn't
# be a huge number, as any pending job consumes some memory.
MAX_QUEUED_JOBS = os.cpu_count() * 4
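# As a concrete example, on an 8-core machine the defaults above evaluate to
# HUB_MAX_WORKERS = 2, HUB_MAX_THREADS = 2, MAX_SYNC_WORKERS = 2 and MAX_QUEUED_JOBS = 32.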
# when creating a snapshot, how long should we wait before querying ES
# to check snapshot status/completion? (in seconds)
# Since myvariant's indices are pretty big, a whole snapshot won't happen in a few
# seconds, so let's just monitor the status every 5min
MONITOR_SNAPSHOT_DELAY = 5 * 60
# Hub environment (e.g. prod, dev, ...)
# Used to generate remote metadata files, like "latest.json", "versions.json".
# If non-empty, this constant is used as a prefix (joined with "-") when generating
# those URLs. So, if "dev", we'll have "dev-latest.json", etc...
# "" means production
HUB_ENV = ""
# Pre-prod/test ES definitions
INDEX_CONFIG = {
    # "build_config_key": None,  # used to select proper idxr/syncer
    "indexer_select": {
        # default
        # None: "path.to.special.Indexer",
    },
    "env": {
        "test": {
            "host": "localhost:9200",
            "indexer": {
                "args": {
                    "timeout": 300,
                    "retry_on_timeout": True,
                    "max_retries": 10,
                },
            },
            "index": [],
        }
    },
}
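# A sketch of an additional environment entry (hypothetical host, not part of the
# original config), declared alongside "test":
# INDEX_CONFIG["env"]["prod"] = {
#     "host": "es-prod.example.org:9200",
#     "indexer": {
#         "args": {"timeout": 300, "retry_on_timeout": True, "max_retries": 10},
#     },
#     "index": [],
# }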
# Snapshot environment configuration
SNAPSHOT_CONFIG = {}
RELEASE_CONFIG = {}
# SSH port for hub console
HUB_SSH_PORT = 19022
HUB_API_PORT = 19080
READONLY_HUB_API_PORT = 19081
# Hub name/icon url/version, for display purpose
HUB_NAME = "Pending (backend)"
HUB_ICON = "http://biothings.io/static/img/sdk-icon.svg"
HUB_VERSION = "master"
USE_RELOADER = True # so there's no need to restart the hub when a datasource has changed
################################################################################
# HUB_PASSWD
################################################################################
# The format is a dictionary of 'username': 'cryptedpassword'
# Generate crypted passwords with 'openssl passwd -crypt'
HUB_PASSWD = {"guest": "9RKfd8gDuNf0Q"}
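# A minimal sketch for adding more users (hypothetical username/password). It assumes
# the stdlib "crypt" module, which is Unix-only and removed in Python 3.13; the
# "openssl passwd -crypt" command mentioned above works just as well.
# import crypt
# HUB_PASSWD["alice"] = crypt.crypt("alice_secret", "ab")  # "ab" is an arbitrary 2-char salt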
# cached data (if None, caches won't be used at all)
CACHE_FOLDER = None
########################################
# APP-SPECIFIC CONFIGURATION VARIABLES #
########################################
# The following variables should or must be defined in your
# own application. Create a config.py file and import this config_hub
# file as:
#
# from config_hub import *
#
# then define the following variables to fit your needs. You can also override
# any other variables in this file as required. Variables defined as
# ConfigurationError() exceptions *must* be defined.
#
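# A minimal config.py would start along these lines (sketch only; paths below are
# placeholders, and the remaining required variables are listed further down):
#
#   from config_hub import *
#   DATA_ARCHIVE_ROOT = "/path/to/data"
#   LOG_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, "logs")
#   logger = setup_default_log("hub", LOG_FOLDER)
#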
# Individual source database connection
DATA_SRC_SERVER = ConfigurationError("Define hostname for source database")
DATA_SRC_PORT = ConfigurationError("Define port for source database")
DATA_SRC_DATABASE = ConfigurationError("Define name for source database")
DATA_SRC_SERVER_USERNAME = ConfigurationError(
    "Define username for source database connection (or None if not needed)")
DATA_SRC_SERVER_PASSWORD = ConfigurationError(
    "Define password for source database connection (or None if not needed)")
# Target (merged collection) database connection
DATA_TARGET_SERVER = ConfigurationError("Define hostname for target database (merged collections)")
DATA_TARGET_PORT = ConfigurationError("Define port for target database (merged collections)")
DATA_TARGET_DATABASE = ConfigurationError("Define name for target database (merged collections)")
DATA_TARGET_SERVER_USERNAME = ConfigurationError(
    "Define username for target database connection (or None if not needed)")
DATA_TARGET_SERVER_PASSWORD = ConfigurationError(
    "Define password for target database connection (or None if not needed)")
HUB_DB_BACKEND = ConfigurationError("Define Hub DB connection")
# Internal backend. Defaults to mongodb
# For now, other options are: mongodb, sqlite3, elasticsearch
# HUB_DB_BACKEND = {
#     "module": "biothings.utils.sqlite3",
#     "sqlite_db_folder": "./db",
# }
# HUB_DB_BACKEND = {
#     "module": "biothings.utils.mongo",
#     "uri": "mongodb://localhost:27017",
#     # "uri": "mongodb://user:passwd@localhost:27017",  # mongodb std URI
# }
# HUB_DB_BACKEND = {
#     "module": "biothings.utils.es",
#     "host": "localhost:9200",
# }
#ES_HOST = ConfigurationError("Define ElasticSearch host used for index creation (eg localhost:9200)")
TORNADO_SETTINGS = {
    # max 10GiB upload
    "max_buffer_size": 10 * 1024 * 1024 * 1024,
}
# Path to a folder to store all downloaded files, logs, caches, etc...
DATA_ARCHIVE_ROOT = ConfigurationError(
    "Define path to folder which will contain all downloaded data, cache files, etc...")
# Path to a folder to store all 3rd party parsers, dumpers, etc...
DATA_PLUGIN_FOLDER = ConfigurationError(
    "Define path to folder which will contain all 3rd party parsers, dumpers, etc...")
DATA_UPLOAD_FOLDER = ConfigurationError("Define path to folder where uploads to API are stored")
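# Typical layout (hypothetical root, following the "usually inside DATA_ARCHIVE_ROOT"
# pattern used below):
# DATA_ARCHIVE_ROOT = "/data/pending_api"
# DATA_PLUGIN_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, "plugins")
# DATA_UPLOAD_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, "dataupload")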
# Path to folder containing diff files
DIFF_PATH = ConfigurationError("Define path to folder which will contain output files from diff")
# Usually inside DATA_ARCHIVE_ROOT
#DIFF_PATH = os.path.join(DATA_ARCHIVE_ROOT,"diff")
# Path to folder containing release note files
RELEASE_PATH = ConfigurationError("Define path to folder which will contain release files")
# Usually inside DATA_ARCHIVE_ROOT
#RELEASE_PATH = os.path.join(DATA_ARCHIVE_ROOT,"release")
# this dir must be created manually
LOG_FOLDER = ConfigurationError("Define path to folder which will contain log files")
# Usually inside DATA_ARCHIVE_ROOT
#LOG_FOLDER = os.path.join(DATA_ARCHIVE_ROOT,'logs')
# When the ES repository type is "fs", where snapshots should be stored
ES_BACKUPS_FOLDER = ConfigurationError(
    "Define path to folder which will contain ES snapshot when type='fs'")
# List of versions.json URLs, Hub will handle these as sources for data releases
VERSION_URLS = []
# default hub logger
logger = ConfigurationError(
    "Provide a default hub logger instance (use setup_default_log(name, log_folder))")
# Usually use default setup
#logger = setup_default_log("hub", LOG_FOLDER)
# silence some noisy third-party loggers
logging.getLogger("elasticsearch").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("requests").setLevel(logging.ERROR)
logging.getLogger('botocore').setLevel(logging.ERROR)
logging.getLogger('boto3').setLevel(logging.ERROR)