-
Notifications
You must be signed in to change notification settings - Fork 1
/
MdxScraper.py
388 lines (331 loc) · 14 KB
/
MdxScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module: MdxScraper
Author: VimWei
Created: January 14, 2024
Modified: March 3, 2024
Description:
Extract specific words from an MDX dictionary and generate HTML, PDF, or JPG files with ease.
It's an adaptation and upgrade based on the original MdxConverter: https://github.com/noword/MdxConverter
"""
import os
import sys
import time
import json
import shutil
import platform
import tempfile
from pathlib import Path
from collections import OrderedDict
from datetime import datetime, timedelta
import imgkit # pip install imgkit
import pdfkit # pip install pdfkit
import openpyxl # pip install openpyxl
from chardet import detect # pip install chardet
from base64 import b64encode # standard library (no pip install needed)
from bs4 import BeautifulSoup # pip install bs4
from settings import (
INPUT_PATH, INPUT_NAME,
DICTIONARY_PATH, DICTIONARY_NAME,
OUTPUT_PATH, OUTPUT_NAME,
InvalidAction, INVALID_ACTION, INVALID_WORDS_NAME,
PDF_OPTIONS, WKHTMLTOPDF_PATHS,
H1_STYLE, SCRAP_STYLE, ADDITIONAL_STYLES,
)
# import mdict_query
current_script_path = Path(__file__).resolve().parent
path_to_be_added = current_script_path / "lib/mdict-query"
sys.path.append(str(path_to_be_added))
import mdict_query
# Function to open a file with detected encoding
def open_encoding_file(name, default_encoding='utf-8'):
    """Open ``name`` in text mode with a detected (or default) encoding.

    The file is first read as raw bytes.  When it contains no newline byte
    the ``default_encoding`` is used directly.  Otherwise ``detect`` is asked
    for a guess, and that guess is used unless its reported confidence is
    below 0.5, in which case ``default_encoding`` is used instead.

    Returns:
        An open text-mode file object created with ``errors='ignore'``.
        The caller is responsible for closing it.
    """
    with open(name, 'rb') as binary_file:
        payload = binary_file.read()

    # Content without any newline is not passed to the detector at all.
    if b'\n' not in payload:
        return open(name, encoding=default_encoding, errors='ignore')

    guess = detect(payload)
    detected = guess['encoding']
    low_confidence = guess.get('confidence', 0) < 0.5
    chosen = default_encoding if low_confidence else detected
    return open(name, encoding=chosen, errors='ignore')
# Function to retrieve words from different file types
def get_words(name):
    """Load the lesson/word list from ``name`` based on its file extension.

    The extension is compared case-insensitively.  ``.xls`` and ``.xlsx``
    are both routed to ``get_words_from_xls``, ``.json`` to
    ``get_words_from_json`` and ``.txt`` to ``get_words_from_txt``.

    Raises:
        KeyError: if the extension is not one of the four listed above.
    """
    suffix = Path(name).suffix.lower()
    readers = {
        '.xls': get_words_from_xls,
        '.xlsx': get_words_from_xls,
        '.json': get_words_from_json,
        '.txt': get_words_from_txt,
    }
    reader = readers[suffix]
    return reader(name)
# Function to get words from a JSON file
def get_words_from_json(name):
    """Parse the JSON file ``name`` and return the decoded object.

    The file is opened through ``open_encoding_file`` so its encoding is
    detected, and it is closed again once parsing has finished (the handle
    used to be left open).

    The result is returned exactly as ``json.load`` produces it; it is not
    validated here.  Presumably it is a list of ``{'name', 'words'}`` dicts
    like the other readers return -- confirm against the input files.
    """
    with open_encoding_file(name) as fp:
        return json.load(fp)
# Function to get words from a text file
def get_words_from_txt(name):
    """Read lessons and words from the plain-text file ``name``.

    Blank lines are skipped.  A line starting with ``#`` opens a new lesson
    whose name is that line with leading and trailing ``#`` characters
    removed.  Every other line is appended as a word to the most recent
    lesson.  If a word appears before any ``#`` line, a lesson named after
    the current timestamp (``YYYYmmdd-HHMMSS``) is created to hold it.

    Returns:
        A list of ``{'name': str, 'words': [str, ...]}`` dicts in file order.
    """
    lessons = []
    # The context manager closes the handle, which was previously leaked.
    with open_encoding_file(name) as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('#'):
                lessons.append({'name': line.strip('#'), 'words': []})
                continue
            if not lessons:
                timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
                lessons.append({'name': timestamp, 'words': []})
            lessons[-1]['words'].append(line)
    return lessons
# Function to get words from an Excel file
def get_words_from_xls(name):
    """Read lessons and words from the workbook ``name`` via openpyxl.

    Each worksheet becomes one lesson named after the sheet; its words are
    the non-empty values of the first column.

    Fixes over the previous version:
      * non-string cells (e.g. numbers) are converted with ``str`` instead
        of raising ``TypeError`` in ``len()``;
      * the read-only workbook is closed explicitly when done;
      * the loop variable no longer shadows the ``name`` parameter.

    Returns:
        A list of ``{'name': str, 'words': [str, ...]}`` dicts, one per sheet.
    """
    workbook = openpyxl.load_workbook(name, read_only=True)
    try:
        lessons = []
        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]
            words = []
            rows = sheet.iter_rows(min_row=sheet.min_row, max_row=sheet.max_row, max_col=1)
            for row in rows:
                if not row:
                    continue
                value = row[0].value
                if value is None:
                    continue
                # String cells are kept untouched; anything else is stringified.
                text = value if isinstance(value, str) else str(value)
                if text:
                    words.append(text)
            lessons.append({'name': sheet_name, 'words': words})
        return lessons
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        workbook.close()
# Function to retrieve CSS from an MDX dictionary or file
def get_css(soup, mdx_path, dictionary):
    """Return the text of the stylesheet linked from ``soup``'s head.

    The stylesheet name is taken from the ``href`` of ``soup.head.link``.
    It is resolved in this order:

    1. a file of that name inside ``mdx_path``;
    2. if ``dictionary`` has an ``_mdd_db`` attribute, the first MDD key
       matching ``'*' + href`` and the first record stored under it;
    3. otherwise an empty string.

    The bytes found are decoded as UTF-8.
    """
    href = soup.head.link['href']
    local_file = Path(mdx_path) / href

    if local_file.exists():
        return local_file.read_bytes().decode('utf-8')

    if not hasattr(dictionary, '_mdd_db'):
        return ''

    mdd_key = dictionary.get_mdd_keys('*' + href)[0]
    return dictionary.mdd_lookup(mdd_key)[0].decode('utf-8')
# Function to merge CSS into the HTML soup
def merge_css(soup, mdx_path, dictionary, append_additinal_styles=True):
    """Inline the dictionary stylesheet into ``soup`` as a ``<style>`` tag.

    The CSS is fetched with ``get_css``.  When ``append_additinal_styles``
    is true, ``ADDITIONAL_STYLES`` is appended to it.  The ``<link>`` in the
    head is then removed and a new ``<style type="text/css">`` holding the
    CSS is appended to the head.

    If the CSS cannot be obtained for any reason, ``soup`` is returned
    unchanged (best effort).

    The CSS is now written to the newly created tag.  Previously it was
    assigned through ``soup.head.style``, which is the *first* ``<style>`` in
    the head, so an existing inline style was overwritten and the new tag
    stayed empty.

    Returns:
        The same ``soup`` object.
    """
    try:
        css = get_css(soup, mdx_path, dictionary)
    except Exception:
        # Deliberate best effort: without a resolvable stylesheet the
        # document is left as it is instead of aborting the run.
        return soup

    if append_additinal_styles:
        css += ADDITIONAL_STYLES

    soup.head.link.decompose()
    style_tag = soup.new_tag('style', type='text/css')
    style_tag.string = css
    soup.head.append(style_tag)
    return soup
# Function to determine image format based on file extension
def get_image_format_from_src(src: str) -> str:
    """Map the file extension of ``src`` to an ``image/*`` MIME subtype.

    The extension is matched case-insensitively.  The result is meant to be
    placed after ``image/`` in a data URI, so ``.svg`` maps to ``svg+xml``
    (it used to return ``svg``, which is not the registered media type).

    Unknown or missing extensions fall back to ``'jpg'``, exactly as before.
    """
    subtypes = {
        '.png': 'png',
        '.jpg': 'jpeg',
        '.jpeg': 'jpeg',
        '.gif': 'gif',
        '.webp': 'webp',
        '.svg': 'svg+xml',
        '.tif': 'tiff',
        '.tiff': 'tiff',
        '.bmp': 'bmp',
    }
    return subtypes.get(Path(src).suffix.lower(), 'jpg')
# Function to replace image source with base64 data in HTML soup
def grab_images(soup, dictionary):
    """Embed images stored in the dictionary's MDD data into ``soup``.

    Nothing is done when ``dictionary`` has no ``_mdd_db`` attribute.
    Otherwise, for every ``<img>`` that has a ``src``:

    * forward slashes in ``src`` are turned into backslashes and a leading
      backslash is added if missing, giving the MDD lookup key;
    * if the lookup returns data, ``src`` is replaced by a base64
      ``data:image/...`` URI built from the first record;
    * images for which nothing is found keep their original ``src``.

    Data URIs are remembered per normalised path so a repeated image is only
    looked up and encoded once.

    Returns:
        The same ``soup`` object.
    """
    if not hasattr(dictionary, '_mdd_db'):
        return soup

    embedded = {}
    for tag in soup.find_all('img'):
        if not tag.has_attr('src'):
            continue

        original_src = tag['src']
        normalised = original_src.replace('/', '\\')

        if normalised in embedded:
            tag['src'] = embedded[normalised]
            continue

        # MDD keys are looked up with a leading backslash.
        mdd_key = normalised if normalised.startswith('\\') else '\\' + normalised
        records = dictionary.mdd_lookup(mdd_key)
        if len(records) == 0:
            continue

        subtype = get_image_format_from_src(original_src)
        encoded = b64encode(records[0]).decode('ascii')
        data_uri = f'data:image/{subtype};base64,' + encoded
        embedded[normalised] = data_uri
        tag['src'] = data_uri

    return soup
# Function to look up a word in the MDX dictionary
def lookup(dictionary, word):
    """Return the definition of ``word`` from ``dictionary``, or ``''``.

    ``word`` is stripped and then tried in three steps, stopping at the
    first one that yields a result: an exact lookup, a case-insensitive
    lookup, and a case-insensitive lookup with hyphens removed.

    If the first definition found starts with ``@@@LINK=`` it is a redirect:
    the target is looked up (exact match) and followed until a real
    definition is reached.  A redirect whose target does not exist, or a
    redirect cycle, now yields ``''`` (previously a missing target raised
    ``IndexError`` and a chained redirect returned the raw link text).

    Returns:
        The stripped definition text, or ``''`` when nothing was found.
    """
    word = word.strip()

    definitions = dictionary.mdx_lookup(word)
    if len(definitions) == 0:
        definitions = dictionary.mdx_lookup(word, ignorecase=True)
    if len(definitions) == 0:
        definitions = dictionary.mdx_lookup(word.replace('-', ''), ignorecase=True)
    if len(definitions) == 0:
        return ''

    link_prefix = '@@@LINK='
    definition = definitions[0]
    visited = set()
    while definition.startswith(link_prefix):
        target = definition.replace(link_prefix, '').strip()
        if target in visited:
            # Redirect loop: there is no real definition to return.
            return ''
        visited.add(target)
        targets = dictionary.mdx_lookup(target)
        if len(targets) == 0:
            # Dangling redirect: treat it as "not found".
            return ''
        definition = targets[0]
    return definition.strip()
# Function to convert MDX to HTML
def mdx2html(mdx_file, input_file, output_file, invalid_action=InvalidAction.Collect, with_toc=True,
             invalid_words_file=None):
    """Look up every word of ``input_file`` in ``mdx_file`` and write HTML.

    Args:
        mdx_file: path of the MDX dictionary; an ``mdict_query.IndexBuilder``
            is created from it.
        input_file: word list, read with ``get_words``.
        output_file: path the UTF-8 encoded HTML is written to.
        invalid_action: what to do with a word that is not found:
            ``InvalidAction.Exit`` terminates the program,
            ``InvalidAction.Collect`` records the word and leaves it out of
            the output, ``InvalidAction.OutputWarning`` emits a warning
            block in its place, and any other value
            (``Collect_OutputWarning``) both records it and emits the warning.
        with_toc: when true, a left-hand table of contents is added; the same
            flag is passed to ``merge_css`` to append the additional styles.
        invalid_words_file: where recorded invalid words are written.  When
            omitted, the module-level ``invalid_words_file`` set by the
            script entry point is used; if that does not exist either, a
            file named ``INVALID_WORDS_NAME`` next to ``output_file`` is
            used.  (The function used to depend on the global
            unconditionally and failed with ``NameError`` when imported.)

    Returns:
        ``(found_count, not_found_count)``.
    """
    print('Looking up words in the dictionary and generating HTML output...\n')
    found_count = 0
    not_found_count = 0
    mdx_file = Path(mdx_file)
    dictionary = mdict_query.IndexBuilder(mdx_file)
    lessons = get_words(input_file)

    right_soup = BeautifulSoup('<body style="font-family:Arial Unicode MS;"><div class="right"></div></body>', 'lxml')
    right_soup.find('body').insert_before('\n')
    left_soup = BeautifulSoup('<div class="left"></div>', 'lxml')
    invalid_words = OrderedDict()

    for lesson in lessons:
        lesson_name = lesson['name']

        # Lesson heading in the content pane plus its entry in the TOC pane.
        h1 = right_soup.new_tag('h1', id='lesson_' + lesson_name, style=H1_STYLE)
        h1.string = lesson_name
        right_soup.div.append(h1)

        a = left_soup.new_tag('a', href='#lesson_' + lesson_name, **{'class': 'lesson'})
        a.string = lesson_name
        left_soup.div.append(a)
        left_soup.div.append(left_soup.new_tag('br'))
        left_soup.div.append('\n')

        for word in lesson['words']:
            invalid = False
            result = lookup(dictionary, word)
            if len(result) == 0:  # not found
                not_found_count += 1
                if invalid_action == InvalidAction.Exit:
                    # Exit with a message (non-zero status) rather than silently.
                    sys.exit(f'"{word}" not found in {mdx_file.name}; aborting.')
                if invalid_action != InvalidAction.OutputWarning:
                    invalid_words.setdefault(lesson_name, []).append(word)
                if invalid_action == InvalidAction.Collect:
                    continue
                invalid = True
                result = f'<div style="padding:0 0 15px 0"><b>WARNING:</b> "{word}" not found</div>'
            else:
                found_count += 1

            definition = BeautifulSoup(result, 'lxml')
            # The first definition that has a <head> donates it to the output.
            if right_soup.head is None and definition.head is not None:
                right_soup.html.insert_before(definition.head)
                right_soup.head.append(right_soup.new_tag('meta', charset='utf-8'))

            new_div = right_soup.new_tag("div", style=SCRAP_STYLE)
            new_div['id'] = 'word_' + word
            new_div['class'] = 'scrapedword'
            if definition.body:
                new_div.append(definition.body)
            right_soup.div.append('\n')
            right_soup.div.append(new_div)

            a = left_soup.new_tag('a', href='#word_' + word,
                                  **{'class': 'word' + (' invalid_word' if invalid else '')})
            a.string = word
            left_soup.div.append(a)
            left_soup.div.append(left_soup.new_tag('br'))
            left_soup.div.append('\n')

        # Blank line between lessons in the TOC.
        left_soup.div.append(left_soup.new_tag('br'))

    if with_toc:
        main_div = right_soup.new_tag('div', **{'class': 'main'})
        right_soup.div.wrap(main_div)
        right_soup.div.insert_before(left_soup.div)

    right_soup = merge_css(right_soup, Path(mdx_file).parent, dictionary, with_toc)
    right_soup = grab_images(right_soup, dictionary)

    html = str(right_soup).encode('utf-8')
    # Drop every bare <body> tag and all but the last </body>.
    html = html.replace(b'<body>', b'').replace(b'</body>', b'', html.count(b'</body>') - 1)
    with open(output_file, "wb") as file:
        file.write(html)

    if len(invalid_words) > 0:
        if invalid_words_file is None:
            invalid_words_file = globals().get('invalid_words_file')
        if invalid_words_file is None:
            invalid_words_file = Path(output_file).with_name(INVALID_WORDS_NAME)
        with open(invalid_words_file, 'w', encoding='utf-8') as fp:
            for lesson_name, words in invalid_words.items():
                fp.write(f'#{lesson_name}\n')
                for word in words:
                    fp.write(word + '\n')

    return found_count, not_found_count
# Function to find wkhtmltopdf path
def find_wkhtmltopdf_path():
    """Return the configured wkhtmltopdf path for the current platform.

    The path comes from ``WKHTMLTOPDF_PATHS``, keyed by ``platform.system()``.
    On Linux the ``'Linux'`` entry is used when it points to an existing
    file, otherwise the ``'Linux_alt'`` entry is returned.

    Raises:
        ValueError: on any platform other than Windows, Linux or Darwin.
    """
    system_name = platform.system()

    if system_name in ('Windows', 'Darwin'):
        return WKHTMLTOPDF_PATHS[system_name]

    if system_name == 'Linux':
        primary = WKHTMLTOPDF_PATHS['Linux']
        if Path(primary).is_file():
            return primary
        return WKHTMLTOPDF_PATHS['Linux_alt']

    raise ValueError("Unsupported platform")
# Function to convert MDX to PDF
def mdx2pdf(mdx_file, input_file, output_file, invalid_action=InvalidAction.Collect):
    """Render the looked-up words to a PDF at ``output_file``.

    The HTML is first produced by ``mdx2html`` (without table of contents)
    into a temporary ``.html`` file, which ``pdfkit`` then converts using the
    wkhtmltopdf binary from ``find_wkhtmltopdf_path`` and ``PDF_OPTIONS``.

    The temporary file is now removed even when HTML generation or the PDF
    conversion fails; previously it was left behind on any error.

    Returns:
        The ``(found_count, not_found_count)`` tuple from ``mdx2html``.
    """
    # Only the name is needed; close the handle before the file is rewritten.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temp:
        temp_path = temp.name
    try:
        result = mdx2html(mdx_file, input_file, temp_path, invalid_action, False)
        print('Converting HTML to PDF...\n')
        config = pdfkit.configuration(wkhtmltopdf=find_wkhtmltopdf_path())
        pdfkit.from_file(temp_path, output_file, configuration=config, options=PDF_OPTIONS)
    finally:
        os.remove(temp_path)
    return result
# Function to convert MDX to JPG
def mdx2jpg(mdx_file, input_file, output_file, invalid_action=InvalidAction.Collect):
    """Render the looked-up words to a JPG image at ``output_file``.

    The HTML is first produced by ``mdx2html`` (without table of contents)
    into a temporary ``.html`` file, which ``imgkit`` then converts with
    local file access enabled.

    The temporary file is now removed even when HTML generation or the image
    conversion fails; previously it was left behind on any error.

    Returns:
        The ``(found_count, not_found_count)`` tuple from ``mdx2html``.
    """
    # Only the name is needed; close the handle before the file is rewritten.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temp:
        temp_path = temp.name
    try:
        result = mdx2html(mdx_file, input_file, temp_path, invalid_action, False)
        print('Converting HTML to JPG...\n')
        imgkit.from_file(temp_path, output_file, options={'enable-local-file-access': ''})
    finally:
        os.remove(temp_path)
    return result
def human_readable_duration(seconds):
    """Format a duration in seconds as ``'HH hours MM minutes SS.mmm seconds'``.

    The hours part is omitted when it is zero; the minutes part is omitted
    when both hours and minutes are zero.  Milliseconds are truncated, not
    rounded.  Negative input is treated as zero.

    The parts are now separated by single spaces; they used to be joined
    with an empty string, giving output such as ``'01 hours02 minutes...'``.
    """
    seconds = max(seconds, 0)
    whole_seconds = int(seconds)
    milliseconds = int((seconds - whole_seconds) * 1000)
    total_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    parts = []
    if hours > 0:
        parts.append(f'{hours:02d} hours')
    if minutes > 0 or hours > 0:
        parts.append(f'{minutes:02d} minutes')
    parts.append(f'{secs:02d}.{milliseconds:03d} seconds')
    return ' '.join(parts)
if __name__ == '__main__':
    # Script entry point: resolve paths from settings, back up the input,
    # run the converter matching the output extension, then report results.
    print('Welcome to MdxScraper:extract specific words from an MDX dictionary and generate HTML, PDF, or JPG with ease!\n')
    start_time = time.time()

    input_file = Path(INPUT_PATH) / INPUT_NAME
    mdx_file = Path(DICTIONARY_PATH) / DICTIONARY_NAME

    # Every file produced by this run is prefixed with the same timestamp.
    currentTime = datetime.now().strftime("%Y%m%d-%H%M%S")

    OUTPUT_PATH = Path(OUTPUT_PATH)
    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

    # Without a configured output name, default to "<input stem>.html".
    if OUTPUT_NAME is None:
        OUTPUT_NAME = Path(INPUT_NAME).stem + '.html'
    OUTPUT_NAME = currentTime + '_' + OUTPUT_NAME
    output_file = OUTPUT_PATH / OUTPUT_NAME

    # Module-level name: mdx2html writes the collected invalid words here.
    invalid_words_file = OUTPUT_PATH / (currentTime + '_' + INVALID_WORDS_NAME)

    backup_input_file = OUTPUT_PATH / (currentTime + '_backup_' + INPUT_NAME)
    shutil.copy(str(input_file), str(backup_input_file))

    # The converter is selected by the output extension (KeyError otherwise).
    output_type = Path(output_file).suffix[1:]
    converters = {
        'html': mdx2html,
        'pdf': mdx2pdf,
        'jpg': mdx2jpg,
    }
    found, not_found = converters[output_type](mdx_file, input_file, output_file, INVALID_ACTION)

    dictionary_label = Path(DICTIONARY_NAME).name

    success_message = f"Success: {found} words extracted from {dictionary_label}."
    if found > 0 or INVALID_ACTION in (InvalidAction.OutputWarning, InvalidAction.Collect_OutputWarning):
        success_message += f" Refer to {output_file}."
    print(success_message + "\n")

    failure_message = f"Failure: {not_found} words not in {dictionary_label}."
    if not_found > 0:
        failure_message += f" Check {invalid_words_file}."
    print(failure_message + "\n")

    end_time = time.time()
    duration = human_readable_duration(end_time - start_time)
    print(f"The entire process took a total of {duration}.\n")