From ff5950690a699dbf69daf3c218d9e94d31773b18 Mon Sep 17 00:00:00 2001
From: tsteven4 <13596209+tsteven4@users.noreply.github.com>
Date: Sat, 14 Sep 2024 12:34:35 -0600
Subject: [PATCH 01/21] rewrite strip_html with QString.
---
util.cc | 97 +++++++++++++++++++++++----------------------------------
1 file changed, 39 insertions(+), 58 deletions(-)
diff --git a/util.cc b/util.cc
index 7840d5268..aa153106a 100644
--- a/util.cc
+++ b/util.cc
@@ -838,82 +838,63 @@ QString strip_html(const QString& utfstring)
doc.setHtml(utfstring);
return doc.toPlainText().simplified();
#else
- char* out;
- char* instr;
- char tag[8];
- unsigned short int taglen = 0;
-
- char* incopy = instr = xstrdup(utfstring);
- /*
- * We only shorten, so just dupe the input buf for space.
- */
- char* outstring = out = xstrdup(utfstring);
-
- tag[0] = 0;
- while (*instr) {
+ QString tag;
+ bool processing_tag = false;
+ QString out;
+
+ for (auto instr = utfstring.cbegin(), end = utfstring.cend(); instr != end;) {
if ((*instr == '<') || (*instr == '&')) {
- tag[0] = *instr;
- taglen = 0;
+ processing_tag = true;
}
- if (! tag[0]) {
+ if (!processing_tag) {
if (*instr == '\n') {
- *out++ = ' ';
+ out.append(' ');
do {
instr++;
- } while (isspace(*instr));
+ } while ((instr != end) && instr->isSpace());
continue;
} else {
- *out++ = *instr;
+ out.append(*instr);
}
} else {
- if (taglen < (sizeof(tag)-1)) {
- tag[taglen++] = tolower(*instr);
- tag[taglen] = 0;
+ if (tag.size() < 7) {
+ tag.append(instr->toLower());
}
}
- if (((tag[0] == '<') && (*instr == '>')) ||
- ((tag[0] == '&') && (*instr == ';'))) {
- if (! strcmp(tag, "&")) {
- *out++ = '&';
- } else if (! strcmp(tag, "<")) {
- *out++ = '<';
- } else if (! strcmp(tag, ">")) {
- *out++ = '>';
- } else if (! strcmp(tag, """)) {
- *out++ = '"';
- } else if (! strcmp(tag, " ")) {
- *out++ = ' ';
- } else if (! strcmp(tag, "°")) {
- *out++ = 'd';
- *out++ = 'e';
- *out++ = 'g';
- } else if ((tag[0]=='<') && (tag[1]=='p')) {
- *out++ = '\n';
- } else if ((tag[0]=='<') && (tag[1]=='b') && (tag[2]=='r')) {
- *out++ = '\n';
- } else if ((tag[0]=='<') && (tag[1]=='/') && (tag[2]=='t') && (tag[3]=='r')) {
- *out++ = '\n';
- } else if ((tag[0]=='<') && (tag[1]=='/') && (tag[2]=='t') && (tag[3]=='d')) {
- *out++ = ' ';
- } else if ((tag[0]=='<') && (tag[1]=='i') && (tag[2]=='m') && (tag[3]=='g')) {
- *out++ = '[';
- *out++ = 'I';
- *out++ = 'M';
- *out++ = 'G';
- *out++ = ']';
+ if ((tag.startsWith('<') && (*instr == '>')) ||
+ (tag.startsWith('&') && (*instr == ';'))) {
+ if (tag == "&") {
+ out.append('&');
+ } else if (tag == "<") {
+ out.append('<');
+ } else if (tag == ">") {
+ out.append('>');
+ } else if (tag == """) {
+ out.append('"');
+ } else if (tag == " ") {
+ out.append(' ');
+ } else if (tag == "°") {
+ out.append("deg");
+ } else if (tag.startsWith("
Date: Sat, 14 Sep 2024 15:55:56 -0600
Subject: [PATCH 02/21] rewrite strip_nasty_html in Qt.
and actually produce valid html:
1. the replacement for "
", "", is invalid.
2. leaving an html tag in causes the html format output to be invalid.
---
reference/gc/GCGCA8_nasty.gpx | 80 +++++++++++++++++++++++++++
reference/gc/GCGCA8_nasty.html | 55 +++++++++++++++++++
testo.d/text.test | 4 ++
util.cc | 99 +++++++---------------------------
4 files changed, 158 insertions(+), 80 deletions(-)
create mode 100644 reference/gc/GCGCA8_nasty.gpx
create mode 100644 reference/gc/GCGCA8_nasty.html
diff --git a/reference/gc/GCGCA8_nasty.gpx b/reference/gc/GCGCA8_nasty.gpx
new file mode 100644
index 000000000..ebcac6636
--- /dev/null
+++ b/reference/gc/GCGCA8_nasty.gpx
@@ -0,0 +1,80 @@
+
+
+ Cache Listing Generated from Geocaching.com
+ This is an individual cache generated from Geocaching.com
+ Account "robertlipe" From Geocaching.com
+ contact@geocaching.com
+ https://www.geocaching.com
+ Geocaching - High Tech Treasure Hunting
+ 2023-10-25T00:44:53.7176739Z
+ cache, geocache
+
+
+ 2003-06-29T00:00:00
+ GCGCA8
+ Oozy rat in a sanitary zoo by robertlipe, Unknown Cache (3/2)
+ https://www.geocaching.com/geocache/GCGCA8
+ Oozy rat in a sanitary zoo
+ Geocache
+ Geocache|Unknown Cache
+
+ Oozy rat in a sanitary zoo
+ robertlipe
+ robertlipe
+ Unknown Cache
+ Not chosen
+
+ Wheelchair accessible
+ Ticks
+ Dangerous animals
+ Poisonous plants
+ Thorns
+ Picnic tables nearby
+ Public restrooms nearby
+ Dogs
+
+ 3
+ 2
+ United States
+ Tennessee
+ <body>The cache is <style>
+not</style> at the coordinates above. These coords will get
+you to the correct park and within 1/2 mile of the cache. The cache
+is within 35 feet of the trail. It is not handicapped accessible.
+It is a nice walk in the woods that is practical for all ages.
+There is no space in the container for trading items. You should
+bring a writing stick and bug spray is recommended.</body>
+
+ <html><body text="color">So if the cache isn't at the above coordinates, where is it?
+<ul>
+<li>Too bad I hid a boot</li>
+<li>Too hot to hoot</li>
+<li>Never odd or even</li>
+<li>Do geese see God?</li>
+<li>"Do nine men interpret?" "Nine men," I nod</li>
+<li>Rats live on no evil star</li>
+<li>Go hang a salami, I'm a lasagna hog</li></ul>
+Now that it's intuitively obvious to even the most casual observer
+where the cache is, turn on your geo-mojo and go find it. <br>
+<image src="http://www.mtgc.org/mtgc_member-banner.gif" width="500"
+height="40" alt=
+"Member of Middle Tennessee GeoCachers Club [www.mtgc.org]"
+ border="0"><br>
+<br></body></html>
+
+
+
+
+
+ 2017-11-11T01:44:14Z
+ Archive
+ robertlipe
+ Removed the container from the final location. Enough construction has occurred since this was placed to make it much less of an adventure than is used to be, so I'm archiving.
+
+Thanx to all that hunted it.
+
+
+
+
+
+
diff --git a/reference/gc/GCGCA8_nasty.html b/reference/gc/GCGCA8_nasty.html
new file mode 100644
index 000000000..d6c592ce1
--- /dev/null
+++ b/reference/gc/GCGCA8_nasty.html
@@ -0,0 +1,55 @@
+
+
+
+
+ GPSBabel HTML Output
+
+
+
+
+ GCGCA8 - Oozy rat in a sanitary zoo
+
+
+
+
+
+ GCGCA8 - N35°55.300 W86°51.700 (16S 512480 3975269)
+Oozy rat in a sanitary zoo by robertlipe
+
+
+ 3 / 2
+Unknown Cache / Unknown
+
+
+
+
+ The cache is at the coordinates above. These coords will get
+you to the correct park and within 1/2 mile of the cache. The cache
+is within 35 feet of the trail. It is not handicapped accessible.
+It is a nice walk in the woods that is practical for all ages.
+There is no space in the container for trading items. You should
+bring a writing stick and bug spray is recommended.
+ So if the cache isn't at the above coordinates, where is it?
+
+Too bad I hid a boot
+Too hot to hoot
+Never odd or even
+Do geese see God?
+"Do nine men interpret?" "Nine men," I nod
+Rats live on no evil star
+Go hang a salami, I'm a lasagna hog
+Now that it's intuitively obvious to even the most casual observer
+where the cache is, turn on your geo-mojo and go find it.
+
+
+
+
+
+
+
+
diff --git a/testo.d/text.test b/testo.d/text.test
index e89b52226..4307260bb 100644
--- a/testo.d/text.test
+++ b/testo.d/text.test
@@ -16,3 +16,7 @@ gpsbabel -i gpx -f ${REFERENCE}/gc/GC7FA4.gpx \
-o text,logs -F ${TMPDIR}/GC7FA4.text
compare ${REFERENCE}/gc/GC7FA4.html ${TMPDIR}/GC7FA4.html
compare ${REFERENCE}/gc/GC7FA4.text ${TMPDIR}/GC7FA4.text
+
+# GCGCA8_nasty.gpx is hand modified to test strip_nastyhtml
+gpsbabel -i gpx -f ${REFERENCE}/gc/GCGCA8_nasty.gpx -o html -F ${TMPDIR}/GCGCA8_nasty.html
+compare ${REFERENCE}/gc/GCGCA8_nasty.html ${TMPDIR}/GCGCA8_nasty.html
diff --git a/util.cc b/util.cc
index aa153106a..7aee18cf1 100644
--- a/util.cc
+++ b/util.cc
@@ -20,6 +20,7 @@
*/
#include // for sort
+#include // for assert
#include // for isspace, tolower
#include // for errno
#include // for INT_MAX, INT_MIN
@@ -35,6 +36,7 @@
#include // for QDateTime
#include // for QFileInfo
#include // for QList
+#include // for QRegularExpressio
#include // for QString
#include // for QTextBoundaryFinder, QTextBoundaryFinder::Grapheme
#include // for QTextCodec
@@ -734,92 +736,29 @@ pretty_deg_format(double lat, double lon, char fmt, const char* sep, bool html)
/*
* Get rid of potentially nasty HTML that would influence another record
* that includes;
- * - to stop backgrounds/background colours from being loaded
+ * - to stop backgrounds/background colors from being loaded
* and - stop processing altogether
* - stop overriding styles for everything
*/
QString
strip_nastyhtml(const QString& in)
{
- char* returnstr = xstrdup(in);
- char* lcstr = xstrdup(in.toLower());
-
- while (char* lcp = strstr(lcstr, "")) {
- char* sp = returnstr + (lcp - lcstr) ; /* becomes */
- sp++;
- *sp++ = '!';
- *sp++ = ' ';
- *sp++ = ' ';
- *sp++ = ' ';
- *lcp = '*'; /* so we wont find it again */
- }
- while (char* lcp = strstr(lcstr, " */
- char* sp = returnstr + (lcp - lcstr) ;
- sp++;
- *sp++ = '!';
- *sp++ = '-';
- *sp++ = '-';
- while ((*sp) && (*sp != '>')) {
- sp++;
- }
- *--sp = '-';
- *--sp = '-';
- *lcp = '*'; /* so we wont find it again */
- }
- while (char* lcp = strstr(lcstr, "")) {
- char* sp = returnstr + (lcp - lcstr) ; /* becomes */
- *sp++ = ' ';
- *sp++ = ' ';
- *sp++ = ' ';
- *sp++ = ' ';
- *sp++ = ' ';
- *sp++ = '-';
- *sp++ = '-';
- *lcp = '*'; /* so we wont find it again */
- }
- while (char* lcp = strstr(lcstr, "", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(htmlre.isValid());
+ static const QRegularExpression bodyre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(bodyre.isValid());
+ static const QRegularExpression stylere(".*?", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(stylere.isValid());
+ QString out(in);
+
+ out.replace(bodyre, "");
+ out.replace("", "", Qt::CaseInsensitive);
+ out.replace(htmlre, "");
+ out.replace("", "", Qt::CaseInsensitive);
+ out.replace(stylere, "");
+ out.replace("
Date: Sat, 14 Sep 2024 18:26:52 -0600
Subject: [PATCH 03/21] cleanup comment xstrdup
---
exif.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/exif.cc b/exif.cc
index cece53d16..8cf300fda 100644
--- a/exif.cc
+++ b/exif.cc
@@ -63,7 +63,7 @@
#include // for memcmp, strlen
#include // for as_const
-#include "defs.h" // for Waypoint, fatal, warning, global_options, global_opts, unknown_alt, xfree, route_disp_all, track_disp_all, waypt_disp_all, wp_flags, KNOTS_TO_MPS, KPH_TO_MPS, MPH_TO_MPS, MPS_TO_KPH, WAYPT_HAS, case_ignore_strcmp, waypt_add, xstrdup, fix_2d
+#include "defs.h" // for Waypoint, fatal, warning, global_options, global_opts, unknown_alt, xfree, route_disp_all, track_disp_all, waypt_disp_all, wp_flags, KNOTS_TO_MPS, KPH_TO_MPS, MPH_TO_MPS, MPS_TO_KPH, WAYPT_HAS, case_ignore_strcmp, waypt_add, fix_2d
#include "garmin_tables.h" // for gt_lookup_datum_index
#include "gbfile.h" // for gbfputuint32, gbfputuint16, gbfgetuint16, gbfgetuint32, gbfseek, gbftell, gbfile, gbfclose, gbfcopyfrom, gbfwrite, gbfopen_be, gbfread, gbfrewind, gbfgetflt, gbfgetint16, gbfopen, gbfputc, gbfputflt, gbsize_t, gbfeof, gbfgetdbl, gbfputdbl, gbfile::(anonymous)
#include "jeeps/gpsmath.h" // for GPS_Math_WGS84_To_Known_Datum_M
From 677da95a77b335bfa686a35fc33e862f8b50f053 Mon Sep 17 00:00:00 2001
From: tsteven4 <13596209+tsteven4@users.noreply.github.com>
Date: Sun, 15 Sep 2024 06:48:48 -0600
Subject: [PATCH 04/21] use regex for strip_html
---
util.cc | 80 ++++++++++++++++++---------------------------------------
1 file changed, 25 insertions(+), 55 deletions(-)
diff --git a/util.cc b/util.cc
index 7aee18cf1..35cde0c13 100644
--- a/util.cc
+++ b/util.cc
@@ -777,62 +777,32 @@ QString strip_html(const QString& utfstring)
doc.setHtml(utfstring);
return doc.toPlainText().simplified();
#else
- QString tag;
- bool processing_tag = false;
- QString out;
+ static const QRegularExpression pre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(pre.isValid());
+ static const QRegularExpression brre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(brre.isValid());
+ static const QRegularExpression trre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(trre.isValid());
+ static const QRegularExpression tdre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(tdre.isValid());
+
+ QString out(utfstring);
+
+ // Tag replacement first
+ out.replace(pre, "\n");
+ out.replace(brre, "\n");
+ out.replace(trre, "\n");
+ out.replace(tdre, " ");
+ out.replace(" ");
+ out.replace(""", "\"");
+ out.replace(" ", " ");
+ out.replace("°", "deg");
- for (auto instr = utfstring.cbegin(), end = utfstring.cend(); instr != end;) {
- if ((*instr == '<') || (*instr == '&')) {
- processing_tag = true;
- }
-
- if (!processing_tag) {
- if (*instr == '\n') {
- out.append(' ');
- do {
- instr++;
- } while ((instr != end) && instr->isSpace());
- continue;
- } else {
- out.append(*instr);
- }
- } else {
- if (tag.size() < 7) {
- tag.append(instr->toLower());
- }
- }
-
- if ((tag.startsWith('<') && (*instr == '>')) ||
- (tag.startsWith('&') && (*instr == ';'))) {
- if (tag == "&") {
- out.append('&');
- } else if (tag == "<") {
- out.append('<');
- } else if (tag == ">") {
- out.append('>');
- } else if (tag == """) {
- out.append('"');
- } else if (tag == " ") {
- out.append(' ');
- } else if (tag == "°") {
- out.append("deg");
- } else if (tag.startsWith("
Date: Sun, 15 Sep 2024 07:53:05 -0600
Subject: [PATCH 05/21] strip_html deletes other tags
---
util.cc | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/util.cc b/util.cc
index 35cde0c13..a812ec576 100644
--- a/util.cc
+++ b/util.cc
@@ -785,6 +785,9 @@ QString strip_html(const QString& utfstring)
assert(trre.isValid());
static const QRegularExpression tdre("
", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(tdre.isValid());
+ static const QRegularExpression otherre("<.*?>", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(otherre.isValid());
+
QString out(utfstring);
@@ -794,6 +797,7 @@ QString strip_html(const QString& utfstring)
out.replace(trre, "\n");
out.replace(tdre, " ");
out.replace("
Date: Sun, 15 Sep 2024 08:00:22 -0600
Subject: [PATCH 06/21] fix strip_html img tag handling
---
util.cc | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/util.cc b/util.cc
index a812ec576..832a86205 100644
--- a/util.cc
+++ b/util.cc
@@ -785,6 +785,8 @@ QString strip_html(const QString& utfstring)
assert(trre.isValid());
static const QRegularExpression tdre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(tdre.isValid());
+ static const QRegularExpression imgre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
+ assert(imgre.isValid());
static const QRegularExpression otherre("<.*?>", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(otherre.isValid());
@@ -796,7 +798,7 @@ QString strip_html(const QString& utfstring)
out.replace(brre, "\n");
out.replace(trre, "\n");
out.replace(tdre, " ");
- out.replace("
Date: Sun, 15 Sep 2024 08:19:33 -0600
Subject: [PATCH 07/21] Revert "fix strip_html img tag handling"
This reverts commit b0440f7e3729909fd51ecbbacdd7a7e4aa5d9198.
---
util.cc | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/util.cc b/util.cc
index 832a86205..a812ec576 100644
--- a/util.cc
+++ b/util.cc
@@ -785,8 +785,6 @@ QString strip_html(const QString& utfstring)
assert(trre.isValid());
static const QRegularExpression tdre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(tdre.isValid());
- static const QRegularExpression imgre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
- assert(imgre.isValid());
static const QRegularExpression otherre("<.*?>", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(otherre.isValid());
@@ -798,7 +796,7 @@ QString strip_html(const QString& utfstring)
out.replace(brre, "\n");
out.replace(trre, "\n");
out.replace(tdre, " ");
- out.replace(imgre, "[IMG]");
+ out.replace("
Date: Sun, 15 Sep 2024 08:19:46 -0600
Subject: [PATCH 08/21] Revert "strip_html deletes other tags"
This reverts commit 40fe2ef0153d8c2e6d53432b9184f8acffbab2ef.
---
util.cc | 4 ----
1 file changed, 4 deletions(-)
diff --git a/util.cc b/util.cc
index a812ec576..35cde0c13 100644
--- a/util.cc
+++ b/util.cc
@@ -785,9 +785,6 @@ QString strip_html(const QString& utfstring)
assert(trre.isValid());
static const QRegularExpression tdre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(tdre.isValid());
- static const QRegularExpression otherre("<.*?>", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
- assert(otherre.isValid());
-
QString out(utfstring);
@@ -797,7 +794,6 @@ QString strip_html(const QString& utfstring)
out.replace(trre, "\n");
out.replace(tdre, " ");
out.replace("
Date: Sun, 15 Sep 2024 08:19:50 -0600
Subject: [PATCH 09/21] Revert "use regex for strip_html"
This reverts commit 677da95a77b335bfa686a35fc33e862f8b50f053.
---
util.cc | 80 +++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 55 insertions(+), 25 deletions(-)
diff --git a/util.cc b/util.cc
index 35cde0c13..7aee18cf1 100644
--- a/util.cc
+++ b/util.cc
@@ -777,32 +777,62 @@ QString strip_html(const QString& utfstring)
doc.setHtml(utfstring);
return doc.toPlainText().simplified();
#else
- static const QRegularExpression pre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
- assert(pre.isValid());
- static const QRegularExpression brre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
- assert(brre.isValid());
- static const QRegularExpression trre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
- assert(trre.isValid());
- static const QRegularExpression tdre("", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
- assert(tdre.isValid());
-
- QString out(utfstring);
-
- // Tag replacement first
- out.replace(pre, "\n");
- out.replace(brre, "\n");
- out.replace(trre, "\n");
- out.replace(tdre, " ");
- out.replace(" ");
- out.replace(""", "\"");
- out.replace(" ", " ");
- out.replace("°", "deg");
+ QString tag;
+ bool processing_tag = false;
+ QString out;
+ for (auto instr = utfstring.cbegin(), end = utfstring.cend(); instr != end;) {
+ if ((*instr == '<') || (*instr == '&')) {
+ processing_tag = true;
+ }
+
+ if (!processing_tag) {
+ if (*instr == '\n') {
+ out.append(' ');
+ do {
+ instr++;
+ } while ((instr != end) && instr->isSpace());
+ continue;
+ } else {
+ out.append(*instr);
+ }
+ } else {
+ if (tag.size() < 7) {
+ tag.append(instr->toLower());
+ }
+ }
+
+ if ((tag.startsWith('<') && (*instr == '>')) ||
+ (tag.startsWith('&') && (*instr == ';'))) {
+ if (tag == "&") {
+ out.append('&');
+ } else if (tag == "<") {
+ out.append('<');
+ } else if (tag == ">") {
+ out.append('>');
+ } else if (tag == """) {
+ out.append('"');
+ } else if (tag == " ") {
+ out.append(' ');
+ } else if (tag == "°") {
+ out.append("deg");
+ } else if (tag.startsWith("
Date: Sun, 15 Sep 2024 14:44:16 -0600
Subject: [PATCH 10/21] implement strip_html using
QRegularExpressionMatchIterator.
---
testo | 1 +
util.cc | 88 +++++++++++++++++++++++++++------------------------------
2 files changed, 43 insertions(+), 46 deletions(-)
diff --git a/testo b/testo
index fc5550162..5083eead1 100755
--- a/testo
+++ b/testo
@@ -189,4 +189,5 @@ if [ -z "${VALGRIND}" ]; then
fi
fi
+echo "Total Errors: $errorcount"
exit $errorcount
diff --git a/util.cc b/util.cc
index 7aee18cf1..1a24e9249 100644
--- a/util.cc
+++ b/util.cc
@@ -36,7 +36,9 @@
#include // for QDateTime
#include // for QFileInfo
#include // for QList
-#include // for QRegularExpressio
+#include // for QRegularExpression
+#include // for QRegularExpressionMatch
+#include // for QRegularExpressionMatchIterator
#include // for QString
#include // for QTextBoundaryFinder, QTextBoundaryFinder::Grapheme
#include // for QTextCodec
@@ -777,62 +779,56 @@ QString strip_html(const QString& utfstring)
doc.setHtml(utfstring);
return doc.toPlainText().simplified();
#else
- QString tag;
- bool processing_tag = false;
+ static const QRegularExpression re("(?:<(?.*?)>)|(?:&(?.*?);)|(?[^<&]+)|(?.+)",
+ QRegularExpression::DotMatchesEverythingOption);
+ assert(re.isValid());
+ static const QRegularExpression newlinespace_re("\\n\\s*");
+ assert(newlinespace_re.isValid());
QString out;
- for (auto instr = utfstring.cbegin(), end = utfstring.cend(); instr != end;) {
- if ((*instr == '<') || (*instr == '&')) {
- processing_tag = true;
- }
-
- if (!processing_tag) {
- if (*instr == '\n') {
+ QRegularExpressionMatchIterator it = re.globalMatch(utfstring);
+ while (it.hasNext()) {
+ auto match = it.next();
+ //qDebug() << match.capturedTexts();
+ // TODO: Qt >= 6.3 use match.hasCaptured(...) instead of !match.captured(...).isNull()
+ if (!match.captured(u"tag").isNull()) {
+ QString tag = match.captured(u"tag");
+ //qDebug() << "tag match:" << tag;
+ if (tag.startsWith("p", Qt::CaseInsensitive)) {
+ out.append('\n');
+ } else if (tag.startsWith("br", Qt::CaseInsensitive)) {
+ out.append('\n');
+ } else if (tag.startsWith("/tr", Qt::CaseInsensitive)) {
+ out.append('\n');
+ } else if (tag.startsWith("/td", Qt::CaseInsensitive)) {
out.append(' ');
- do {
- instr++;
- } while ((instr != end) && instr->isSpace());
- continue;
- } else {
- out.append(*instr);
- }
- } else {
- if (tag.size() < 7) {
- tag.append(instr->toLower());
- }
- }
-
- if ((tag.startsWith('<') && (*instr == '>')) ||
- (tag.startsWith('&') && (*instr == ';'))) {
- if (tag == "&") {
+ } else if (tag.startsWith("img", Qt::CaseInsensitive)) {
+ out.append("[IMG]");
+ } // else eat the tag
+ } else if (!match.captured(u"entity").isNull()) {
+ QString entity = match.captured(u"entity");
+ //qDebug() << "entity match:" << entity;
+ if (match.captured() == "amp") {
out.append('&');
- } else if (tag == "<") {
+ } else if (match.captured() == "lt") {
out.append('<');
- } else if (tag == ">") {
+ } else if (match.captured() == "gt") {
out.append('>');
- } else if (tag == """) {
+ } else if (match.captured() == "quot") {
out.append('"');
- } else if (tag == " ") {
+ } else if (match.captured() == "nbsp") {
out.append(' ');
- } else if (tag == "°") {
+ } else if (match.captured() == "deg") {
out.append("deg");
- } else if (tag.startsWith("
Date: Sun, 15 Sep 2024 14:56:30 -0600
Subject: [PATCH 11/21] a little cleanup
---
util.cc | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/util.cc b/util.cc
index 1a24e9249..bb9e4f5e3 100644
--- a/util.cc
+++ b/util.cc
@@ -752,7 +752,7 @@ strip_nastyhtml(const QString& in)
static const QRegularExpression stylere(".*?", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption);
assert(stylere.isValid());
QString out(in);
-
+
out.replace(bodyre, "");
out.replace("