-
Notifications
You must be signed in to change notification settings - Fork 1
/
process_md.py
52 lines (39 loc) · 1.88 KB
/
process_md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re
import os
from pathlib import Path
def process_markdown(content):
    """Strip markdown markup from *content* and return the remaining text.

    The following are removed, in this order: leading front matter, a
    ``<!-- TOC -->`` ... ``<!-- /TOC -->`` section, HTML comments, images
    (including images wrapped in a link), link targets (the link text is
    kept), header ``#`` markers, ``*``/``_`` emphasis markers, and
    ``[[...]]`` configuration sections. Runs of blank lines are then
    collapsed to a single blank line.

    Args:
        content: Raw text of one markdown document.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Front matter: an opening +++ or --- line at the very start of the
    # document, closed by the SAME delimiter on a line of its own. Requiring
    # a matching, line-anchored closer keeps a "---" or "+++" inside a front
    # matter value from ending the block early.
    content = re.sub(
        r'\A(\+{3}|-{3})[ \t]*\r?\n.*?^\1[ \t\r]*$\s*',
        '',
        content,
        flags=re.DOTALL | re.MULTILINE,
    )
    # Remove table of contents section if it exists (between TOC markers).
    content = re.sub(r'<!-- TOC -->.*?<!-- /TOC -->', '', content, flags=re.DOTALL)
    # Remove HTML comments.
    content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
    # Images must be handled BEFORE links: the link pattern also matches the
    # "[alt](url)" tail of "![alt](url)" and would leave "!alt" behind.
    # First drop images that are themselves wrapped in a link (badges), so no
    # empty "[](url)" shell survives the image removal below.
    content = re.sub(r'\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)', '', content)
    # Then drop plain images. Negated classes keep the alt text from running
    # past its own closing bracket into a following link.
    content = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', content)
    # Remove markdown links but keep the text.
    content = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', content)
    # Remove markdown header markers (#) but keep the header text.
    content = re.sub(r'^#+\s*(.+)$', r'\1', content, flags=re.MULTILINE)
    # Remove emphasis marks (* and _) but keep the text.
    # NOTE(review): this pattern also strips underscores inside identifiers
    # such as snake_case names; left as-is to preserve existing output.
    content = re.sub(r'[*_]{1,2}([^*_]+)[*_]{1,2}', r'\1', content)
    # Remove [[...]] configuration sections up to the next blank line.
    content = re.sub(r'\[\[.*?\]\].*?(?=\n\n|\Z)', '', content, flags=re.DOTALL)
    # Collapse runs of blank lines into a single blank line.
    content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
    return content.strip()
base_dir = '/Users/micn/Documents/code/ftl/docs/content'
output = []

# Walk the content tree and collect the cleaned text of every .md file,
# each preceded by a banner naming the file relative to base_dir.
for dirpath, _dirnames, filenames in os.walk(base_dir):
    for filename in filenames:
        if not file_is_markdown(filename) if False else not filename.endswith('.md'):
            continue
        md_path = os.path.join(dirpath, filename)
        with open(md_path, 'r', encoding='utf-8') as src:
            cleaned = process_markdown(src.read())
        # Skip files that are empty once the markup has been stripped.
        if not cleaned.strip():
            continue
        output.extend((
            f"\n\n=== {os.path.relpath(md_path, base_dir)} ===\n\n",
            cleaned,
        ))

# Write all collected pieces, newline-joined, into a single text file.
with open('ftl.txt', 'w', encoding='utf-8') as out:
    out.write('\n'.join(output))