#!/usr/bin/env python3
# Creates a '.csv' file containing a full list of site URLs to add to a
# spreadsheet for large-scale docs reviews.
import csv
import os
from html.parser import HTMLParser

from utils.root import get_project_root
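
# NOTE: this script shares its name with the stdlib 'csv' module. If the
# script's own directory ends up first on sys.path, 'import csv' can resolve
# to this file instead of the stdlib; rename the script if that happens.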


# Use HTMLParser to walk the generated site's primary './build/docs/index.html'
# file and collect nav-link anchors as (page title, link text, href) tuples.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.recording = 0
        self.data = []
        # (title, link text, href); initialized as a 3-tuple so that
        # handle_data can safely index temp_data[2] before an href is seen.
        self.temp_data = ('', '', '')
        self.ignore_list = ['api-reference', 'widget', 'release-notes']
        self.title = ''
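
    # 'recording' drives a simple state machine:
    #   0 = idle, 1 = inside a nav-link <a> (capture its text and href),
    #   2 = inside <title> (capture the page title).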
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            if self.recording:
                # Track anchors nested inside the one being captured.
                self.recording += 1
            for name, value in attrs:
                if name == 'class' and value == 'nav-link':
                    self.recording = 1
                if self.recording and name == 'href':
                    # Skip ignored sections, except the release-notes index.
                    if any(item in value for item in self.ignore_list) and value != '/release-notes/index.html':
                        self.recording = 0
                    else:
                        self.temp_data = (self.title, '', value)
        elif tag == 'title':
            self.recording = 2

    def handle_endtag(self, tag):
        if tag == 'a' and self.recording == 1:
            self.recording -= 1
            if self.recording == 0:
                # The captured (title, text, href) tuple is complete.
                self.data.append(self.temp_data)
        elif tag == 'title':
            self.recording = 0

    def handle_data(self, data):
        if self.recording == 1:
            # Fill in the link text for the anchor being captured.
            self.temp_data = (self.temp_data[0], data, self.temp_data[2])
        elif self.recording == 2:
            self.title = data


# Name of the output file, written to the project root.
output_file = "site-urls.csv"


# Extracts the section path from a given href.
def extract_section_from_href(href):
    # Split the href on '/'.
    parts = href.split('/')
    # Remove any empty strings caused by leading or trailing slashes.
    parts = [part for part in parts if part]
    # Keep everything except the last part (the page itself).
    section = "/".join(parts[:-1])
    return section
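
# Example (hypothetical href):
#   extract_section_from_href('/guides/getting-started/index.html')
#   -> 'guides/getting-started'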


# Formats the section path for display.
def format_section(section):
    # Split the section by '/'.
    parts = section.split('/')
    # Convert each hyphenated part to title case.
    parts = [part.replace('-', ' ').title() for part in parts]
    # Join the parts with ' > ' and return.
    return ' > '.join(parts)
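
# Example: format_section('guides/getting-started') -> 'Guides > Getting Started'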


# Main function to create the CSV file containing site URLs and metadata.
def main():
    # Set the directory to search using get_project_root() directly.
    search_directory = os.path.join(get_project_root(), 'build/docs/')
    # Parse the index.html to get the nav links.
    with open(os.path.join(search_directory, 'index.html'), 'r') as f:
        parser = MyHTMLParser()
        parser.feed(f.read())
    # Open the output CSV file.
    with open(os.path.join(get_project_root(), output_file), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write the CSV headers.
        writer.writerow(['SME', 'Status', 'Implemented', 'Doc Section', 'Doc Title', 'Doc Link', 'Review Link'])
        # For each link, write one row to the CSV.
        for _, doc_title, href in parser.data:
            doc_section = extract_section_from_href(href)
            # Format the doc_section for display.
            formatted_section = format_section(doc_section)
            # If the doc_section is blank, fall back to the doc title.
            if not formatted_section:
                formatted_section = doc_title.title()
            # Build the public doc link from the href.
            doc_link = f"https://docs.kobiton.com/{href}"
            writer.writerow(['', '', '', formatted_section, doc_title, doc_link, ''])


if __name__ == "__main__":
    main()
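
# Usage sketch (assumes the docs site was already built into build/docs/):
#   python3 csv.py
# writes 'site-urls.csv' at the project root. See the naming note by the
# imports if 'import csv' resolves to this file instead of the stdlib.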