# pinchy.cc.psm1
#
# Generates Markdown front-matter stub files for Wikimedia Commons file pages
# and Wikipedia articles.
#
Function ConvertTo-NormalHTML {
    <#
    .SYNOPSIS
        Parses HTML text into an MSHTML ("HTMLFile") COM document.
    .DESCRIPTION
        Used together with Invoke-WebRequest -UseBasicParsing so that the page
        is parsed here rather than by Invoke-WebRequest itself. Background:
        https://stackoverflow.com/questions/56187543/invoke-webrequest-freezes-hangs
    .PARAMETER HTML
        Either a web response object or a plain HTML string. For a response
        object the body (Content) is parsed when it is a string; otherwise
        RawContent is used.
    .OUTPUTS
        The HTMLFile COM document after the markup has been written to it.
    #>
    param([Parameter(Mandatory = $true, ValueFromPipeline = $true)]$HTML)

    # Pick the markup to parse. RawContent starts with the HTTP status line
    # and headers, so the body-only Content property is preferred when present.
    if ($HTML -is [string]) {
        $markup = $HTML
    } elseif ($HTML.Content -is [string]) {
        $markup = $HTML.Content
    } else {
        $markup = $HTML.RawContent
    }

    $NormalHTML = New-Object -ComObject "HTMLFile"
    try {
        $NormalHTML.IHTMLDocument2_write($markup)
    } catch {
        # NOTE(review): IHTMLDocument2_write is reportedly not exposed on every
        # machine (it appears to depend on installed interop assemblies). The
        # plain write() method is the usual fallback and takes UTF-16 bytes.
        $NormalHTML.write([System.Text.Encoding]::Unicode.GetBytes($markup))
    }
    return $NormalHTML
}
function Import-CommonsPicture {
    <#
    .SYNOPSIS
        Writes a YAML front-matter stub for a Wikimedia Commons file page.
    .DESCRIPTION
        Downloads the page at -url, derives a title from the page's <title>
        element and writes the front matter to "commons.wikimedia.org.md" in
        the current location, overwriting any existing file of that name.
        If the download fails, an error is written and nothing is created.
    .PARAMETER url
        Address of the Commons page, e.g.
        https://commons.wikimedia.org/wiki/File:Example.svg
    #>
    param (
        [string]$url
    )

    #
    # Fetch the content from the URL.
    #
    # -UseBasicParsing plus ConvertTo-NormalHTML is used instead of the
    # response's ParsedHtml property to avoid a hang that occurs randomly:
    # https://stackoverflow.com/questions/56187543/invoke-webrequest-freezes-hangs
    #
    try {
        $response = Invoke-WebRequest -Uri $url -UseBasicParsing -ErrorAction Stop
    } catch {
        Write-Error "Failed to download content from $url. Error: $_"
        return
    }
    $parsed = ConvertTo-NormalHTML -HTML $response

    #
    # title: "..." (must be enclosed in quotes)
    #
    # The title on Wikimedia Commons has the following format:
    #   "File:filename.svg - Wikimedia Commons"
    # Drop the "File:" prefix and turn the site suffix into "(Wikimedia Commons)".
    #
    $title = $parsed.title
    $title = $title -replace "^File:", ""
    $title = $title -replace " - Wikimedia Commons$", " (Wikimedia Commons)"
    # Escape backslashes and double quotes so the value stays a valid
    # double-quoted YAML scalar.
    $yamlTitle = $title -replace '\\', '\\' -replace '"', '\"'

    #
    # url: /host/path/ - scheme stripped and the File: namespace removed
    #
    $localUrl = "/" + ($url -replace "^https?://", "") + "/"
    $localUrl = $localUrl -replace "/wiki/File:", "/wiki/"

    #
    # Assemble the front matter, delimited by --- lines.
    #
    $lines = @()
    $lines += "---"
    $lines += "title: `"$yamlTitle`""
    $lines += "type: picture"
    $lines += "url: $localUrl"
    $lines += "website: `"$url`""
    # tags:
    #  - Wikimedia Commons
    $lines += "tags:"
    $lines += " - Wikimedia Commons"
    $lines += "---"

    #
    # Write the content to a file
    #
    $outputPath = "commons.wikimedia.org.md"
    $lines | Out-File -FilePath $outputPath -Encoding utf8
}
function Import-Wikipedia {
    <#
    .SYNOPSIS
        Writes a YAML front-matter stub for a Wikipedia article.
    .DESCRIPTION
        Downloads the page at -url, takes the article title from the page's
        <title> element and writes the front matter to "en.wikipedia.org.md"
        in the current location, overwriting any existing file of that name.
        If the download fails, an error is written and nothing is created.
    .PARAMETER url
        Address of the Wikipedia article.
    #>
    param (
        [string]$url
    )

    #
    # Fetch the content from the URL.
    #
    try {
        #
        # Note: Invoke-WebRequest is used with -UseBasicParsing
        # to avoid a hang that occurs randomly. For more info, see:
        # https://stackoverflow.com/questions/56187543/invoke-webrequest-freezes-hangs
        #
        $response = Invoke-WebRequest -Uri $url -UseBasicParsing -ErrorAction Stop
    } catch {
        Write-Error "Failed to download content from $url. Error: $_"
        return
    }

    #
    # Manually parse the HTML using the StackOverflow solution.
    #
    $parsed = ConvertTo-NormalHTML -HTML $response

    #
    # The page title ends in " - Wikipedia"; strip that suffix.
    #
    $title = $parsed.title
    $title = $title -replace " - Wikipedia$", ""
    # Escape backslashes and double quotes so the value stays a valid
    # double-quoted YAML scalar.
    $yamlTitle = $title -replace '\\', '\\' -replace '"', '\"'

    #
    # Assemble the front matter, delimited by --- lines.
    #
    $lines = @()
    $lines += "---"
    # title: "..." (must be enclosed in quotes)
    $lines += "title: `"$yamlTitle (Wikipedia)`""
    # license: fixed value written for every article
    $lines += "license: CC BY-SA 4.0"
    # retrieved: today's date as yyyy-MM-dd
    $lines += "retrieved: " + (Get-Date -Format "yyyy-MM-dd")
    $lines += "type: website"
    # url: the article address without its scheme, wrapped in slashes
    $lines += "url: /" + ($url -replace "^https?://", "") + "/"
    # website: "..."
    $lines += "website: `"$url`""
    # wikipedia of: the bare article title. Quoted so that titles containing
    # ": ", "#" or quote characters do not break the YAML.
    $lines += "wikipedia of: `"$yamlTitle`""
    # tags:
    $lines += "tags:"
    $lines += " - Wikipedia"
    $lines += "---"

    #
    # Write the content to a file
    #
    $outputPath = "en.wikipedia.org.md"
    $lines | Out-File -FilePath $outputPath -Encoding utf8
}
# Commands exported from this module. ConvertTo-NormalHTML is not listed here.
Export-ModuleMember -Function @(
    'Import-CommonsPicture'
    'Import-Wikipedia'
)