From af73e757783f8d0fb221f15a8602e8d032556fb3 Mon Sep 17 00:00:00 2001 From: Sriram Sami Date: Tue, 21 Jun 2022 17:32:41 +0800 Subject: [PATCH 1/3] Add searching for url in post text, needs testing --- src/containers/ResultCard.tsx | 11 +++++ src/pages/Sidebar/Sidebar.tsx | 85 ++++++++++++++++++++++++++++++++++- src/providers/hackernews.ts | 40 +++++++++++++++++ src/providers/providers.ts | 3 ++ src/providers/reddit.ts | 37 ++++++++++++++- 5 files changed, 173 insertions(+), 3 deletions(-) diff --git a/src/containers/ResultCard.tsx b/src/containers/ResultCard.tsx index 4e90082..e9617ea 100644 --- a/src/containers/ResultCard.tsx +++ b/src/containers/ResultCard.tsx @@ -122,6 +122,17 @@ const ResultCard = ({ EXACT MATCH )} + {resultWithReplacedLink.providerQueryType === + ProviderQueryType.EXACT_URL_TEXT && ( + // data-iscapture="true" allow us to immediately dismiss tooltip on user scroll +
+ EXACT URL TEXT MATCH +
+ )} + {resultWithReplacedLink.subSourceName !== "" && (
{ @@ -192,7 +196,12 @@ const debugResults = (providerData: AllProviderResults | undefined) => { // Split results into the different sources when under debug mode const haveHnExactResults = hnResults[ProviderQueryType.EXACT_URL]?.length > 0; const haveRedditExactResults = redditResults[ProviderQueryType.EXACT_URL]?.length > 0; + const haveHnExactUrlTextResults = + hnResults[ProviderQueryType.EXACT_URL_TEXT]?.length > 0; + const haveRedditExactUrlTextResults = + redditResults[ProviderQueryType.EXACT_URL_TEXT]?.length > 0; + log.debug( `Have HN exact: ${haveHnExactResults}, have Reddit exact: ${haveRedditExactResults}` ); @@ -203,6 +212,8 @@ const debugResults = (providerData: AllProviderResults | undefined) => { return { haveHnExactResults, haveRedditExactResults, + haveHnExactUrlTextResults, + haveRedditExactUrlTextResults, haveHnTitleResults, haveRedditTitleResults, }; @@ -408,6 +419,8 @@ const Sidebar = () => { const { haveHnExactResults, haveRedditExactResults, + haveHnExactUrlTextResults, + haveRedditExactUrlTextResults, haveHnTitleResults, haveRedditTitleResults, } = debugResults(providerData); @@ -648,6 +661,76 @@ const Sidebar = () => {
)} +
+ {haveHnExactUrlTextResults || haveRedditExactUrlTextResults ? ( +
+
+ Results for{" "} + + {`current URL in post text`} + +
+ {" "} + ({searchExactUrl}){" "} +
+
+ {haveHnExactUrlTextResults && ( +
+ +
+
+ )} + {haveRedditExactUrlTextResults && ( +
+ +
+ )} +
+ ) : ( +
+ No results for{" "} + + {`current URL in post text`} + +
+ {" "} + ({searchExactUrl}){" "} +
+
+ )} +
{haveHnTitleResults || haveRedditTitleResults ? (
diff --git a/src/providers/hackernews.ts b/src/providers/hackernews.ts index 6b9cbeb..2580376 100644 --- a/src/providers/hackernews.ts +++ b/src/providers/hackernews.ts @@ -107,6 +107,46 @@ export class HnResultProvider implements ResultProvider { }; } + async getExactUrlTextResults(url: string): Promise { + const encodedUrl = encodeURIComponent(url); + const queryString = `query=\"${encodedUrl}\"&tags=story&typoTolerance=false`; + const requestUrl = "https://hn.algolia.com/api/v1/search?" + queryString; + const res: HnJsonResult = await cachedApiCall( + requestUrl, + true, + CACHE_URL_DURATION_SEC + ); + if (res.nbHits === 0) { + log.debug("Hacker News API: No urls found"); + return { + providerName: ProviderType.HACKER_NEWS, + queryType: ProviderQueryType.EXACT_URL_TEXT, + results: [], + }; + } + log.debug("HN Results Pre-translation:"); + log.debug(res.hits); + const itemsAll = + res.hits?.map((hnHit) => + translateHnToItem( + hnHit, + ProviderQueryType.EXACT_URL_TEXT, + url, + requestUrl + ) + ) || []; + log.debug("Hacker News returned results for exact url text search:", { + response: res, + resultsWithoutDedup: itemsAll, + resultsTranslated: itemsAll, + }); + return { + providerName: ProviderType.HACKER_NEWS, + queryType: ProviderQueryType.EXACT_URL_TEXT, + results: itemsAll, + }; + } + // Main function to get all relevant results from HN async getSiteUrlResults(url: string): Promise { const encodedUrl = encodeURIComponent(url); diff --git a/src/providers/providers.ts b/src/providers/providers.ts index dd114b2..ca20d45 100644 --- a/src/providers/providers.ts +++ b/src/providers/providers.ts @@ -13,6 +13,7 @@ import { scoreResultsRelevance } from "./scoring"; // All providers must implement these two functions for search export interface ResultProvider { getExactUrlResults(url: string): Promise; + getExactUrlTextResults(url: string): Promise; getSiteUrlResults(url: string): Promise; getTitleResults(url: string, title: string): Promise; getComments(url: 
string): Promise; @@ -26,6 +27,7 @@ export enum ProviderType { // To indicate inside the result structure, so we know where in the UI to place it export enum ProviderQueryType { EXACT_URL = "exact_url", + EXACT_URL_TEXT = "exact_url_text", SITE_URL = "site_url", TITLE = "title", } @@ -193,6 +195,7 @@ export async function fetchDataFromProviders( const providerPromises: Promise[] = providers .map((provider) => [ provider.getExactUrlResults(cleanedUrl), + provider.getExactUrlTextResults(cleanedUrl), scoreResultsRelevance( documentTitle, provider.getTitleResults(cleanedUrl, documentTitle) diff --git a/src/providers/reddit.ts b/src/providers/reddit.ts index 04622f0..c4b8862 100644 --- a/src/providers/reddit.ts +++ b/src/providers/reddit.ts @@ -24,7 +24,7 @@ const cheerio = require("cheerio"); export class RedditResultProvider implements ResultProvider { // Main function to get all relevant results from Reddit async getExactUrlResults(url: string): Promise { - const queryString = "sort=top&q=" + encodeURIComponent("url:" + url); + const queryString = "sort=top&q=" + encodeURIComponent('url:"' + url + '"'); const requestUrl = "https://old.reddit.com/search?" + queryString; const data = await cachedApiCall(requestUrl, false, CACHE_URL_DURATION_SEC); @@ -39,6 +39,7 @@ export class RedditResultProvider implements ResultProvider { ) ) .toArray(); + // Remove non-exact url matches const itemsDeduped = itemsAll.filter( (item) => (item.submittedUrl.endsWith(url) || @@ -62,6 +63,39 @@ export class RedditResultProvider implements ResultProvider { }; } + // Main function to get all relevant results from Reddit + async getExactUrlTextResults(url: string): Promise { + const queryString = 'sort=relevance&q="' + encodeURIComponent(url) + '"'; + const requestUrl = "https://old.reddit.com/search?" 
+ queryString; + const data = await cachedApiCall(requestUrl, false, CACHE_URL_DURATION_SEC); + + const $ = cheerio.load(data); + const itemsAll: ResultItem[] = $(".search-result.search-result-link") + .map((i: number, el: Element) => + this.translateRedditToItem( + $(el).html(), + ProviderQueryType.EXACT_URL_TEXT, + url, + requestUrl + ) + ) + .toArray(); + + if (itemsAll.length === 0) { + return { + providerName: ProviderType.REDDIT, + queryType: ProviderQueryType.EXACT_URL_TEXT, + results: [], + }; + } + + return { + providerName: ProviderType.REDDIT, + queryType: ProviderQueryType.EXACT_URL_TEXT, + results: itemsAll, + }; + } + async getSiteUrlResults(url: string): Promise { const queryString = "sort=top&q=" + encodeURIComponent("site:" + url); const requestUrl = "https://old.reddit.com/search?" + queryString; @@ -220,7 +254,6 @@ export class RedditResultProvider implements ResultProvider { providerRequestUrl: string ): ResultItem { const $ = cheerio.load(html); - const url = $(".search-link").attr("href"); const commentsText = $(".search-comments").text(); const commentsLink = $(".search-comments").attr("href"); From 65d85a45fb3ffff4189b1bf18c4b945e406a5e02 Mon Sep 17 00:00:00 2001 From: Sriram Sami Date: Tue, 21 Jun 2022 17:32:52 +0800 Subject: [PATCH 2/3] Change ML filter threshold to 0 from -5 --- src/shared/constants.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/constants.ts b/src/shared/constants.ts index 0e93a47..39f6770 100644 --- a/src/shared/constants.ts +++ b/src/shared/constants.ts @@ -15,7 +15,7 @@ export const EVENTS_HOST = export const ML_HOST = "https://crowdwise-ml-jhhom.ondigitalocean.app/api/score_documents"; export const ML_API_KEY = "5b58147b-d869-465a-ab43-41c2ffc29ae0"; -export const ML_FILTER_THRESHOLD = -5.0; +export const ML_FILTER_THRESHOLD = 0.0; export const GITHUB_REPOSITORY_LINK = "https://github.com/usecrowdwise/crowdwise"; From b6df30885dd2ed577eb03cf9d39f5d7e1e413bf5 Mon Sep 17 00:00:00 2001 
From: Tang Yew Siang Date: Sun, 26 Jun 2022 14:47:45 +0800 Subject: [PATCH 3/3] Update ResultCard.tsx --- src/containers/ResultCard.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/containers/ResultCard.tsx b/src/containers/ResultCard.tsx index e9617ea..0c2c836 100644 --- a/src/containers/ResultCard.tsx +++ b/src/containers/ResultCard.tsx @@ -126,10 +126,10 @@ const ResultCard = ({ ProviderQueryType.EXACT_URL_TEXT && ( // data-iscapture="true" allow us to immediately dismiss tooltip on user scroll
- EXACT URL TEXT MATCH + DISCUSSION CONTAINS LINK
)}