From dd08d816dca127808b2343005ec8a728b6cb2a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anton=20Luka=20=C5=A0ijanec?= Date: Fri, 5 Jan 2024 16:44:26 +0100 Subject: support for suggested queries and query redirects --- src/api.c | 44 ++++++++++++++++++++++++++++++++--- src/httpd.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++----------- src/i18n.h | 2 ++ src/lib.c | 30 +++++++++++++++--------- src/main.c | 1 + src/structs.c | 6 ++++- 6 files changed, 128 insertions(+), 29 deletions(-) (limited to 'src') diff --git a/src/api.c b/src/api.c index a652e3a..b0ef96d 100644 --- a/src/api.c +++ b/src/api.c @@ -127,6 +127,7 @@ int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking enum sc_return sc_query_google (const char * s, /* breaking change: changed return type */ struct sc_cache * c, struct sc_query * q, + char ** redirect, /* variable redirect will be set to a heap allocated string that must be freed by the caller if the upstream returned results for a different query. in that case the returned query object will be for a different search string! -- if NULL, request that upstream does not enable "results for" feature */ SC_OPT_TYPE opt) { /* check4cachedB4 */ /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */ /* if query is not NULL, it MUST be initialized */ @@ -155,6 +156,9 @@ enum sc_return sc_query_google (const char * s, /* breaking change: changed retu htmlDocPtr xmldoc = NULL; char * txtdoc = NULL; int qwasgiven = 0; + SC_LOG(SC_LOG_DEBUG, c, "%s called, redirect is %p", __func__, redirect); + if (redirect) + *redirect = NULL; if (!s || !c) { rs = SC_BADCALL; goto rc; @@ -166,7 +170,7 @@ enum sc_return sc_query_google (const char * s, /* breaking change: changed retu qwasgiven++; char * us = malloc(sizeof(char)*strlen(s)*3+1); urlencode(us, s); - txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100&ie=UTF-8%s", us, (opt&SC_OPT_IMAGE) ? "&tbm=isch" : ""); + txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100&ie=UTF-8%s%s", us, (opt&SC_OPT_IMAGE) ? "&tbm=isch" : "", redirect ? "" : "&nfpr=1"); // fprintf(stdout, "%s\n", txtdoc); free(us); if (!txtdoc) { @@ -178,6 +182,7 @@ enum sc_return sc_query_google (const char * s, /* breaking change: changed retu rs = SC_CAPTCHA; goto rc; } + char * resultsforclass = sc_find_class(txtdoc, "{color:#1967d2}"); if (opt & SC_OPT_IMAGE) { imageclass = sc_find_class(txtdoc, "{font-family:Roboto,Helvetica,Arial,sans-serif}"); if (!imageclass) { @@ -292,11 +297,42 @@ enum sc_return sc_query_google (const char * s, /* breaking change: changed retu SC_CUE(c, c->queries_lock); goto rc; } + q->string = realloc(q->string, sl+1); + strcpy(q->string, s); + char * xpathsugg = NULL; + if (resultsforclass) { + xpathsugg = malloc(512+strlen(resultsforclass)); + sprintf(xpathsugg, "//a[contains(@class, '%s')]", resultsforclass); + xmlNodePtr suggnode = nthNodeX(xmldoc, xpathsugg, 0); + if (suggnode && xmlHasProp(suggnode, BAD_CAST "href")) { + char * href = (char *) xmlGetProp(suggnode, BAD_CAST "href"); + char * content = (char *) xmlNodeGetContent(suggnode); + if (href && strstr(href, "&spell=1&")) + strcpy((q->suggested = realloc(q->suggested, strlen(content)+1)), content); + xmlFree(href); + xmlFree(content); + } else { + free(q->suggested); + q->suggested = NULL; + } + } else { + free(q->suggested); + q->suggested = NULL; + } + xmlNodePtr first = nthNodeX(xmldoc, xpathsugg, 1); + if (redirect && xpathsugg && q->suggested && xmlHasProp(first, BAD_CAST "href")) { + char * href = (char *) xmlGetProp(first, BAD_CAST "href"); + if (href && strstr(href, "&nfpr=1&")) { + *redirect = q->suggested; + q->suggested = NULL; + q->string = realloc(q->string, strlen(*redirect)+1); + strcpy(q->string, *redirect); + } + xmlFree(href); + } q->cache = c; q->lookup_time = time(NULL); - q->string = realloc(q->string, sl+1); q->opt |= opt | SC_ENGINE_GOOGLE; - strcpy(q->string, s); if (!qwasgiven) { SC_CWLE(c, c->queries_lock); #ifdef SC_OLD_STORAGE @@ -318,6 +354,8 @@ rc: free(titleclass); free(descclass); free(imageclass); + free(resultsforclass); free(xpath); + free(xpathsugg); return rs; } diff --git a/src/httpd.c b/src/httpd.c index 0171a11..c200fc6 100644 --- a/src/httpd.c +++ b/src/httpd.c @@ -3,7 +3,7 @@ char * sc_https2http (char * i) { memmove(i+4, i+5, strlen(i)-3); return i; } -char * sc_queryhtml (const struct sc_query * q, const char * add_form, size_t l) { /* remember to free returned string in the caller */ /* caller takes care of freeing */ +char * sc_queryhtml (const struct sc_query * q, const char * add_form, size_t l, const char * r) { /* remember to free returned string in the caller */ /* caller takes care of freeing */ size_t resultshtml_written = 0; size_t resultshtml_sizeof = SC_ALLOC_CHUNK; char * resultshtml = malloc(resultshtml_sizeof); @@ -39,16 +39,48 @@ char * sc_queryhtml (const struct sc_query * q, const char * add_form, size_t l) free(safebody); free(safeurl); } -#define SC_HRS SC_I18N_NUMBER_OF_RESULTS ": %zu | " SC_I18N_QUERY_TIME ": %s" char formatted_time[128]; struct tm tm; localtime_r(&q->lookup_time, &tm); strftime(formatted_time, 128, SC_I18N_DATETIME_FORMAT, &tm); - char queryinfo[256]; - snprintf(queryinfo, 256, SC_HRS, q->results_length, formatted_time); + char * safesuggested = NULL; + if (q->suggested && strlen(q->suggested) < 4096) { + safesuggested = alloca(strlen(q->suggested)*3+256); + strcpy(safesuggested, "?q="); + urlencode(safesuggested+3, q->suggested); + } + if (!q->suggested && r && strlen(r) < 4096) { + safesuggested = alloca(strlen(r)*3+256); + strcpy(safesuggested, "?q="); + urlencode(safesuggested+3, r); + } + char * htmlsuggested = htmlspecialchars(q->suggested); + if (!htmlsuggested) + htmlsuggested = htmlspecialchars(r); + if (safesuggested) { + if (strstr(add_form, "name=h")) + strcat(safesuggested, "&h=h"); + if (strstr(add_form, "name=l")) + sprintf(safesuggested+strlen(safesuggested), "&l=%d", atoi(strstr(add_form, "name=l")+8)); + if (strstr(add_form, "name=h")) + strcat(safesuggested, "&h=h"); + if (strstr(add_form, "name=e") || r) + strcat(safesuggested, "&e=e"); + } + char * suggested = NULL; + if (htmlsuggested && safesuggested) + suggested = malloc(1+strlen(SC_I18N_DID_YOU_REALLY_MEAN)+strlen(SC_I18N_DID_YOU_MEAN)+strlen(safesuggested)+strlen(htmlsuggested)); + if (suggested) + sprintf(suggested, "%s %s", q->suggested ? SC_I18N_DID_YOU_MEAN : SC_I18N_DID_YOU_REALLY_MEAN, safesuggested, htmlsuggested); + char * queryinfo = malloc(256+strlen(suggested ? suggested : "")); + snprintf(queryinfo, 256, "%s%s" SC_I18N_NUMBER_OF_RESULTS ": %zu | " SC_I18N_QUERY_TIME ": %s" +, suggested ? suggested : "", suggested ? " | " : "", q->results_length, formatted_time); char * safequery = htmlspecialchars(q->string); char * response = malloc(strlen((char *) sc_hp)+2*strlen(safequery)+strlen(queryinfo)+strlen(resultshtml)+strlen(add_form)); sprintf(response, (char *) sc_hp, safequery, safequery, add_form, queryinfo, resultshtml); + free(queryinfo); + free(suggested); + free(htmlsuggested); free(safequery); free(resultshtml); return response; @@ -130,6 +162,8 @@ enum MHD_Result sc_httpd (void * cls, const char * l = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "l"); const char * h = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "h"); const char * f = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "f"); + const char * e = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "e"); + const char * r = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "r"); snprintf(add_form, 128, "%s%s%d%s", h ? "" : "", l ? "queries_lock); } else { SC_CUE(c, c->queries_lock); - enum sc_return r = sc_query_google(query, c, NULL, opt); + char * redirect = NULL; + enum sc_return r = sc_query_google(query, c, NULL, e ? NULL : &redirect, opt); + if (redirect && strlen(query) < 4096 && strlen(redirect) < 4096) { + status_code = 307; + location = alloca(256+strlen(query)*3+strlen(redirect)*3); + sprintf(location, "?l=%d%s%s%s%s&q=", atoi(l ? l : ""), (opt & SC_OPT_IMAGE) ? "&i=i" : "", h ? "&h=h" : "", f ? "&f=f" : "", e ? "&e=e" : ""); + urlencode(location+strlen(location), redirect); + free(redirect); + redirect = NULL; + strcat(location, "&r="); + urlencode(location+strlen(location), query); + content_type = "text/plain"; + char * safeurl = htmlspecialchars(location); + free(response); + response = malloc(strlen(safeurl)*3+512); + sprintf(response, "%s", safeurl, safeurl, safeurl); + free(safeurl); + goto sendresp; + } if (already_retried++ || r == SC_CAPTCHA) { status_code = 570+ABS(r); if (r == SC_CAPTCHA && strlen(query) < 4096) { @@ -224,15 +276,8 @@ retry: status_code = 307; location = alloca(strlen(getenv("SC_FALLBACK")) + 256 + strlen(query)*3); - sprintf(location, "%sl=%d&q=", getenv("SC_FALLBACK"), - atoi(l ? l : "")); + sprintf(location, "%sl=%d%s%s%s%s&q=", getenv("SC_FALLBACK"), atoi(l ? l : ""), (opt & SC_OPT_IMAGE) ? "&i=i" : "", h ? "&h=h" : "", f ? "&f=f" : "", e ? "&e=e" : ""); urlencode(location+strlen(location), query); - if (opt & SC_OPT_IMAGE) - strcat(location, "&i=i"); - if (h) - strcat(location, "&h=h"); - if (f) - strcat(location, "&f=f"); } char * safequery = htmlspecialchars(query); response = malloc(strlen((char*) sc_hp) @@ -258,6 +303,7 @@ retry: } else goto retry; } } + sendresp: httpd_response = MHD_create_response_from_buffer (response_len ? response_len : strlen(response), (void *) response, mhdrmm); MHD_add_response_header(httpd_response, "Content-Type", content_type); if (status_code >= 300 && status_code <= 399) diff --git a/src/i18n.h b/src/i18n.h index d06e4f0..86d7b21 100644 --- a/src/i18n.h +++ b/src/i18n.h @@ -35,3 +35,5 @@ #define SC_I18N_SEARCH "išči" #define SC_I18N_HORSESHOE "hitro" #define SC_I18N_IMAGES "slike" +#define SC_I18N_DID_YOU_REALLY_MEAN "preusmeril sem vas iz:" +#define SC_I18N_DID_YOU_MEAN "predlagam iskanje:" diff --git a/src/lib.c b/src/lib.c index 2377ec9..62ddf84 100644 --- a/src/lib.c +++ b/src/lib.c @@ -46,18 +46,23 @@ void eachNodeX (htmlDocPtr doc, const char * xpath, node_function_t f, void * da eachNode(nodes, f, data); xmlXPathFreeObject(nodes); } -xmlNodePtr nthNodeXN (xmlNodePtr node, const char * xpath, int n) { - xmlXPathObjectPtr nodes = findNodesN(node, xpath); - if (!nodes) - return NULL; - xmlNodeSetPtr nodeset = nodes->nodesetval; - int size = nodeset->nodeNr; - if (size <= n) - return NULL; - xmlNodePtr toreturn = (xmlNodePtr) nodeset->nodeTab[n]; - xmlXPathFreeObject(nodes); - return toreturn; +#define nthNodeFunctionGenerator(type, x) \ +xmlNodePtr nthNodeX##x (type node, const char * xpath, int n) { \ + xmlXPathObjectPtr nodes = findNodes##x(node, xpath); \ + if (!nodes) \ + return NULL; \ + xmlNodeSetPtr nodeset = nodes->nodesetval; \ + int size = nodeset->nodeNr; \ + if (size <= n) { \ + xmlXPathFreeObject(nodes); \ + return NULL; \ + } \ + xmlNodePtr toreturn = (xmlNodePtr) nodeset->nodeTab[n]; \ + xmlXPathFreeObject(nodes); \ + return toreturn; \ } +nthNodeFunctionGenerator(htmlDocPtr,) // this one gets doc +nthNodeFunctionGenerator(xmlNodePtr, N) #define EACHNODE(node, nodes) /* you can instead use eachNodeX with anonymous function - no need to free and findnodes separatl */ \ for (int EACHNODE_i = 0; \ nodes ? nodes->nodesetval ? \ @@ -110,6 +115,9 @@ char * htmlspecialchars (const char * i) { /* remember to free the output */ case '"': w += sprintf(o+w, """); break; + case '\'': + w += sprintf(o+w, "'"); + break; default: o[w++] = *i; break; diff --git a/src/main.c b/src/main.c index fdab429..a9857b6 100644 --- a/src/main.c +++ b/src/main.c @@ -1,4 +1,5 @@ #define _GNU_SOURCE +#include #include #include #include diff --git a/src/structs.c b/src/structs.c index 6deea3e..f23e52f 100644 --- a/src/structs.c +++ b/src/structs.c @@ -84,6 +84,8 @@ struct sc_query { SC_IN_STRUCT_ARRAY(struct sc_result, results); /* yesfree */ char * string; /* yesfree - query string, stripped of any excess characters that should be excluded from indexing */ time_t lookup_time; /* time of last lookup */ + char * suggested; /* yesfree - suggested search query (did you mean) */ + bool redirect; /* true if client is encouraged to be redirected to suggested (showing results for) */ SC_OPT_TYPE opt; /* some options including engines */ }; struct sc_query * sc_query_init () { @@ -95,6 +97,7 @@ struct sc_query * sc_query_init () { q->results[i]->query = q; } q->string = NULL; + q->suggested = NULL; q->opt = SC_OPT_INIT; return q; } @@ -122,8 +125,9 @@ sc_query_free ( #endif ; if (q->cache) - SC_LOG(SC_LOG_DEBUG, q->cache, "sc_query_free: %s", q->string ? q->string : "NULL"); + SC_LOG(SC_LOG_DEBUG, q->cache, "sc_query_free: %s (sugg: %s)", q->string ? q->string : "NULL", q->suggested ? q->suggested : "NULL"); free(q->string); /* if they were not alloced, they are NULL, if they were free'd somewhere else, they are also set to NULL */ + free(q->suggested); for (size_t i = 0; i < q->results_sizeof; i++) sc_result_free(q->results[i]); free(q->results); -- cgit v1.2.3