commit ea6b1bf2d9dde070915090a352f07f2041ac9478
parent ec4d1fa0668d0cdd65c95ec5b1d1f4edfda9b413
Author: ukai <ukai>
Date: Thu, 22 Nov 2001 14:15:19 +0000
[w3m-dev 02503]
From: aito@fw.ipsj.or.jp
closes: Debian Bug#120540
Diffstat:
2 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,9 @@
+2001-11-22 aito@fw.ipsj.or.jp
+
+ * [w3m-dev 02503]
+ * indep.c (getescapechar): allow incomplete entity references in URL
+ closes: Debian Bug#120540
+
2001-11-22 Fumitoshi UKAI <ukai@debian.or.jp>
* [w3m-dev 02506]
diff --git a/indep.c b/indep.c
@@ -275,6 +275,7 @@ getescapechar(char **str)
{
int dummy = -1;
char *p = *str, *q;
+ int strict_entity = TRUE;
if (*p == '&')
p++;
@@ -319,8 +320,24 @@ getescapechar(char **str)
for (p++; IS_ALNUM(*p); p++)
;
q = allocStr(q, p - q);
+ if (strcasestr("lt gt amp quot nbsp",q) &&
+ *p != '=') {
+ /* a character entity MUST be terminated with ";". However,
+ there's MANY web pages which uses &lt , &gt or something
+ like them as &lt;, &gt;, etc. Therefore, we treat the most
+ popular character entities (including &#xxxx;) without
+ the last ";" as character entities. If the trailing character
+ is "=", it must be a part of query in an URL. So &lt=, &gt=, etc.
+ are not regarded as character entities.
+ */
+ strict_entity = FALSE;
+ }
if (*p == ';')
p++;
+ else if (strict_entity) {
+ *str = p;
+ return -1;
+ }
*str = p;
return getHash_si(&entity, q, -1);
}