Skip to content

Commit b139240

Browse files
authored
Create import_onenote_zip.py
copy-paste from a private repo, that's not in a fit-to-share state!
1 parent 1d87c5a commit b139240

1 file changed

Lines changed: 283 additions & 0 deletions

File tree

metril/import_onenote_zip.py

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
"""Process OneNote zip archive and prepare it for Trilium import.
2+
3+
Make OneNote HTML exports trilium compatible.
4+
- Identifies and converts the page title (first h1 or p with larger font size) to a proper h1 tag
5+
- Italicizes the date after fixing the title
6+
- remove filelist.xml files (not needed)
7+
"""
8+
9+
import os
10+
import re
11+
import zipfile
12+
from typing import Optional, Tuple
13+
from bs4 import BeautifulSoup, Tag
14+
from dataclasses import dataclass
15+
from dotenv import load_dotenv
16+
17+
# Load environment configuration from a .env file at import time.
# NOTE(review): this runs as a module-level side effect — importing this
# module without a .env file terminates the process.
if load_dotenv():
    # Trilium ETAPI connection settings; os.getenv returns None for any
    # variable missing from the .env file — not validated here.
    server_url = os.getenv('TRILIUM_HOST')
    token = os.getenv('TRILIUM_TOKEN')
else:
    print(".env not found")
    print("Please set TRILIUM_HOST and TRILIUM_TOKEN environment variables - https://github.com/zadam/trilium/wiki/ETAPI")
    exit(1)
25+
26+
import click
27+
from trilium_py.client import ETAPI
28+
29+
@dataclass
class FontConfig:
    """Settings that govern how font sizes are interpreted when hunting for a page title."""

    # Fallback body font size (points) used when the HTML declares none.
    default_size: float = 11.0
    # A candidate title must be more than 10% larger than the base size.
    size_threshold_multiplier: float = 1.1
34+
35+
def extract_font_size(style: Optional[str]) -> Optional[float]:
    """Extract the numeric font-size value from a CSS style string.

    Args:
        style: CSS style string that may contain a ``font-size`` declaration.

    Returns:
        Float value of the font size if found, None otherwise.
    """
    if not style or 'font-size:' not in style:
        return None

    # Take the declaration value up to the next ';' and pull the first
    # number out of it (handles units such as "11.0pt" or "14px").
    # Using an explicit match check instead of the previous
    # exception-driven flow (`.group()` on a possible None).
    declaration = style.split('font-size:')[1].split(';')[0]
    match = re.search(r'\d+(?:\.\d+)?', declaration)
    return float(match.group()) if match else None
52+
53+
def get_base_font_size(soup: BeautifulSoup, config: FontConfig) -> float:
    """Determine the document's base font size.

    Reads the ``style`` attribute of the <body> tag; falls back to the
    configured default when no usable size is declared there.

    Args:
        soup: BeautifulSoup parsed HTML.
        config: Font configuration settings.

    Returns:
        Base font size value.
    """
    body_tag = soup.find('body')
    if body_tag:
        size = extract_font_size(body_tag.get('style'))
        if size:
            print(f'Found body font size: {size}')
            return size

    print(f'Using default font size: {config.default_size}')
    return config.default_size
70+
71+
def process_title_tag(tag: Tag, base_font_size: float, config: FontConfig) -> Optional[str]:
    """Promote *tag* to an <h1> when its font size marks it as the page title.

    Args:
        tag: HTML tag to process.
        base_font_size: Base font size for comparison.
        config: Font configuration settings.

    Returns:
        Processed title text if the tag was converted to h1, None otherwise.
    """
    style = tag.get('style')
    if not style:
        return None

    size = extract_font_size(style)
    # Only promote tags whose font is noticeably larger than the base text.
    if not size or size <= base_font_size * config.size_threshold_multiplier:
        return None

    # Collapse all runs of whitespace into single spaces, then rewrite the
    # tag in place as an <h1> holding the cleaned text.
    cleaned = re.sub(r'\s+', ' ', tag.get_text())
    tag.name = 'h1'
    tag.string = cleaned
    return cleaned
94+
95+
def update_html_title(soup: BeautifulSoup, title: str) -> None:
    """Update both the <title> tag and create/update <h1> with the given title.

    Creates the <head>, <html>, <title> and <h1> elements as needed so the
    document always ends up carrying the title in both places.

    Args:
        soup: BeautifulSoup parsed HTML (modified in place).
        title: Title text to set.
    """
    # --- <title> in <head> -------------------------------------------------
    head = soup.find('head')
    if not head:
        head = soup.new_tag('head')
        if soup.html:
            soup.html.insert(0, head)
        else:
            # No <html> root at all: build one around the new head.
            html = soup.new_tag('html')
            html.append(head)
            soup.append(html)

    title_tag = head.find('title')
    if not title_tag:
        title_tag = soup.new_tag('title')
        head.append(title_tag)
    title_tag.string = title

    # --- <h1> in <body> ----------------------------------------------------
    h1_tag = soup.find('h1')
    if not h1_tag:
        h1_tag = soup.new_tag('h1')
        body = soup.find('body')
        if body:
            if body.contents:
                body.insert(0, h1_tag)
            else:
                body.append(h1_tag)
        else:
            # Bug fix: previously the new <h1> was silently dropped when the
            # document had no <body>; attach it to the document root instead.
            (soup.html or soup).append(h1_tag)
    h1_tag.string = title
130+
131+
def italicize_first_date(soup: BeautifulSoup) -> None:
    """Wrap the date/time paragraphs after the title in semantic <time> tags.

    NOTE(review): despite the name, nothing is italicized — the first two
    <p> tags following the <h1> (OneNote's date and time lines) are wrapped
    in HTML5 <time> elements with a ``datetime`` attribute.

    Args:
        soup: BeautifulSoup parsed HTML (modified in place).
    """
    # Hoisted out of the try blocks below, where it was previously buried.
    from datetime import datetime

    h1_tag = soup.find('h1')
    if not h1_tag:
        return

    # First <p> after the title is the date.
    date_p = h1_tag.find_next('p')
    if not date_p:
        return

    date_text = date_p.get_text().strip()
    time_tag = soup.new_tag('time')
    try:
        # OneNote exports dates as e.g. "2024-Jan-05" (YYYY-MMM-DD).
        parsed_date = datetime.strptime(date_text, '%Y-%b-%d')
        time_tag['datetime'] = parsed_date.strftime('%Y-%m-%d')
    except ValueError:
        # If we can't parse the date, just use the text as is.
        time_tag['datetime'] = date_text

    date_p.string = date_text
    date_p.string.wrap(time_tag)

    # Second <p> is the time of day.
    time_p = date_p.find_next('p')
    if not time_p:
        return

    time_text = time_p.get_text().strip()
    time_tag = soup.new_tag('time')
    try:
        parsed_time = datetime.strptime(time_text, '%I:%M %p')
        time_tag['datetime'] = parsed_time.strftime('%H:%M')
    except ValueError:
        time_tag['datetime'] = time_text

    time_p.string = time_text
    time_p.string.wrap(time_tag)
174+
175+
def convert_page_title_to_h1(html: bytes, config: FontConfig = FontConfig()) -> str:
    """Convert OneNote HTML page title to proper h1 tag and process date.

    Args:
        html: Raw HTML content (bytes; a pre-decoded str is also accepted).
        config: Font configuration settings. NOTE(review): a shared default
            instance — safe only while FontConfig is never mutated.

    Returns:
        Processed HTML with proper h1 title and <time>-wrapped date.
    """
    # Generalized: accept str as well as bytes so callers need not pre-decode.
    text = html.decode('utf-8') if isinstance(html, bytes) else html
    soup = BeautifulSoup(text, 'html.parser')
    base_font_size = get_base_font_size(soup, config)

    # Scan the first few headline candidates for an oversized "title" tag.
    title = None
    for tag in soup.find_all(['h1', 'p'])[:5]:  # Check first 5 candidates
        try:
            if title := process_title_tag(tag, base_font_size, config):
                print(f'Found and converted title: {title}')
                # Remove the original tag since we'll create a new one
                tag.decompose()
                break
        except Exception as e:
            print(f'Error processing tag: {tag} - {str(e)}')
            continue

    if title:
        update_html_title(soup, title)

    # Process the date after finding the title
    try:
        italicize_first_date(soup)
        print('Processed date paragraph')
    except Exception as e:
        print(f'Error processing date: {str(e)}')

    new_html = soup.prettify()

    # Debug output. NOTE(review): this matches the first occurrence of the
    # substring 'title' anywhere in the document, not necessarily the
    # <title> element itself.
    new_title_index = new_html.find('title')
    if new_title_index >= 0:
        new_title = new_html[new_title_index - 1:new_title_index + 50]
        print(f'New title:\n{new_title}\n')
    body_index = new_html.find('<body')
    if body_index >= 0:
        preview = new_html[body_index:body_index + 700]
        print(f'Preview of processed HTML body:\n{preview}\n')

    return new_html
224+
225+
@click.command(help="Process OneNote ZIP export for Trilium import")
@click.argument("zip_path", required=True)
@click.option('--keep-filelist', is_flag=True, help='Keep filelist.xml files (removed by default)')
@click.option('--debug', is_flag=True, help='Show debug information')
def process_zip(zip_path: str, keep_filelist: bool = False, debug: bool = False) -> str:
    """Process OneNote ZIP export and prepare for Trilium import.

    Args:
        zip_path: Path to OneNote ZIP export.
        keep_filelist: If True, keep filelist.xml files, otherwise remove them.
        debug: If True, show additional debug information.

    Returns:
        Path to processed ZIP file (relative to the ./out directory).
    """
    # Bug fix: resolve the input path BEFORE changing directory — a relative
    # zip_path would no longer resolve after the chdir below.
    zip_path = os.path.abspath(zip_path)
    os.chdir('./out')
    output_zip_path = 'fixed_' + os.path.basename(zip_path)

    with open(zip_path, 'rb') as f:
        with zipfile.ZipFile(f) as input_zip:
            # Debug: list all files in zip up front.
            if debug:
                print("\nFiles in ZIP:")
                for file_info in input_zip.filelist:
                    print(f"  {file_info.filename} ({file_info.file_size:,} bytes)")
                print()

            with zipfile.ZipFile(output_zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as output_zip:
                for file_info in input_zip.filelist:
                    filename = file_info.filename

                    # Skip filelist.xml files unless explicitly kept
                    if 'filelist.xml' in filename and not keep_filelist:
                        print(f'Skipping: {filename}')
                        continue

                    with input_zip.open(filename) as source:
                        if filename.endswith('.htm'):
                            # Rewrite OneNote HTML pages for Trilium.
                            fixed_html = convert_page_title_to_h1(source.read())
                            output_zip.writestr(filename, fixed_html)
                            if debug:
                                print(f'Processed HTML in: {filename} ({file_info.file_size:,} bytes)')
                            else:
                                print(f'Processed HTML in: {filename}')
                        else:
                            # Everything else is copied through unchanged.
                            output_zip.writestr(filename, source.read())
                            if debug:
                                print(f'Copied: {filename} ({file_info.file_size:,} bytes)')
                            elif filename.endswith('/'):  # Only print directory entries in non-debug mode
                                print(f'Copied: {filename}')

    print(f'\nCreated processed ZIP file: {output_zip_path}')
    return output_zip_path
278+
279+
# Alias for backward compatibility: older callers imported `get_zip`.
get_zip = process_zip

if __name__ == '__main__':
    # click supplies the arguments when invoked from the command line.
    process_zip()

0 commit comments

Comments
 (0)