Skip to content

Commit

Permalink
batched writing to disk (#659)
Browse files Browse the repository at this point in the history
Instead of writing the result line by line, implemented random.choices for randomisation of multiple stations and writing large batches to the disk. Also, instead of "round", just using :.1f, which is probably quicker on a large scale because it's not a mathematical function.
  • Loading branch information
nickyreinert authored Jan 31, 2024
1 parent 0c5c228 commit f553179
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions src/main/python/create_measurements.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,18 @@ def build_test_data(weather_station_names, num_rows_to_create):
coldest_temp = -99.9
hottest_temp = 99.9
station_names_10k_max = random.choices(weather_station_names, k=10_000)
progress_step = max(1, int(num_rows_to_create / 100))
batch_size = 10000 # instead of writing line by line to file, process a batch of stations and put it to disk
progress_step = max(1, (num_rows_to_create // batch_size) // 100)
print('Building test data...')

try:
with open("../../../data/measurements.txt", 'w') as file:
for s in range(0,num_rows_to_create):
random_station = random.choice(station_names_10k_max)
random_temp = round(random.uniform(coldest_temp, hottest_temp), 1)
file.write(f"{random_station};{random_temp}\n")
for s in range(0,num_rows_to_create // batch_size):

batch = random.choices(station_names_10k_max, k=batch_size)
prepped_deviated_batch = '\n'.join([f"{station};{random.uniform(coldest_temp, hottest_temp):.1f}" for station in batch]) # :.1f should quicker than round on a large scale, because round utilizes mathematical operation
file.write(prepped_deviated_batch + '\n')

# Update progress bar every 1%
if s % progress_step == 0 or s == num_rows_to_create - 1:
sys.stdout.write('\r')
Expand Down

0 comments on commit f553179

Please sign in to comment.