@@ -20,8 +20,51 @@ const MAX_CONTENT_LENGTH = 8000;
// Running counters for this module; every counter starts at zero.
const stats = {
  total: 0,
  filtered: 0,
  injected: 0,
  errors: 0,
  batched: 0,
};

// Starts empty and is declared with `let`, so it can be reassigned.
// NOTE(review): presumably holds items awaiting a batched flush — confirm
// against the code that fills it.
let batch = [];
2222
// Built-in allowlist of medical and computer-science sources.
// matchesDomain() falls back to this list when DOMAINS is empty.
// Entries are bare hosts, except the Microsoft Research ones, which carry a
// path because the research site lives under the shared microsoft.com host.
const DEFAULT_DOMAINS = [
  // Medical - Major Publishers & Journals
  'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov', 'who.int',
  'nature.com', 'nejm.org', 'bmj.com', 'thelancet.com',
  'jamanetwork.com', 'annals.org', 'sciencedirect.com',
  // Medical - Clinical Resources
  'mayoclinic.org', 'clevelandclinic.org', 'medlineplus.gov',
  'cdc.gov', 'nih.gov', 'webmd.com', 'healthline.com',
  'medscape.com', 'uptodate.com',
  // Medical - Oncology & Dermatology
  'cancer.org', 'aad.org', 'dermnetnz.org', 'melanoma.org',
  'asco.org', 'esmo.org', 'nccn.org', 'cancer.net',
  'mskcc.org', 'mdanderson.org', 'dana-farber.org',
  'dermcoll.edu.au', 'bad.org.uk', 'euroderm.org',
  'jaad.org', 'jidonline.org',
  // Medical - Publishers & Open Access
  'wiley.com', 'onlinelibrary.wiley.com', 'springer.com',
  'karger.com', 'thieme.com', 'mdpi.com', 'frontiersin.org',
  'plos.org', 'biomedcentral.com', 'cell.com', 'elsevier.com',
  // Medical - Regulatory & Evidence
  'clinicaltrials.gov', 'fda.gov', 'ema.europa.eu',
  'nice.org.uk', 'cochrane.org',
  'hopkinsmedicine.org', 'stanfordmedicine.org',
  // CS - Conferences & Journals
  'arxiv.org', 'acm.org', 'dl.acm.org', 'ieee.org',
  'ieeexplore.ieee.org', 'proceedings.neurips.cc',
  'aclanthology.org', 'jmlr.org', 'aaai.org', 'ijcai.org',
  'usenix.org', 'vldb.org', 'sigmod.org', 'icml.cc',
  'cvpr.thecvf.com', 'eccv.ecva.net', 'iccv.thecvf.com',
  'openreview.net', 'paperswithcode.com',
  // CS - Frameworks & Tools
  'huggingface.co', 'pytorch.org', 'tensorflow.org',
  'wandb.ai', 'mlflow.org', 'ray.io',
  'dmlc.cs.washington.edu',
  // CS - Research Labs & Universities
  'cs.stanford.edu', 'cs.berkeley.edu', 'cs.cmu.edu',
  'cs.mit.edu', 'deepmind.google', 'ai.meta.com',
  'research.google',
  // Microsoft Research is served under www.microsoft.com/en-us/research/,
  // which the shorter 'microsoft.com/research' fragment does not match.
  // The short form is kept so existing behaviour is not narrowed.
  'microsoft.com/research', 'microsoft.com/en-us/research',
  'blog.openai.com', 'anthropic.com',
];
64+
/**
 * Reports whether `url` belongs to one of the allow-listed sources.
 *
 * Uses DOMAINS when it is non-empty and DEFAULT_DOMAINS otherwise.
 *
 * Matching is done on the parsed URL rather than on the raw string, so an
 * allowed domain that merely appears in a query string, or as the tail of a
 * longer unrelated host name ('ray.io' vs 'array.io'), no longer counts.
 *
 * An entry matches when:
 *   - the URL's hostname equals the entry's host or is a subdomain of it, and
 *   - if the entry carries a path ('microsoft.com/research'), the URL path is
 *     that path or lies beneath it.
 * Entries without a dot are treated as keywords and matched as a substring of
 * the hostname, so keyword-style configuration keeps working.
 * Blank entries are ignored instead of matching every URL.
 *
 * @param {string} url Absolute URL, or a scheme-less "host/path" string.
 * @returns {boolean} true when the URL is covered by the active list;
 *   false for non-string, empty or unparseable input.
 */
function matchesDomain(url) {
  if (typeof url !== 'string') return false;
  const raw = url.trim();
  if (raw === '') return false;

  const allDomains = DOMAINS.length > 0 ? DOMAINS : DEFAULT_DOMAINS;

  // Scheme-less input ("www.who.int/news") is parsed as if it were https.
  const schemePattern = /^[a-z][a-z0-9+.-]*:\/\//i;
  let parsed;
  try {
    parsed = new URL(schemePattern.test(raw) ? raw : `https://${raw}`);
  } catch (_err) {
    return false;
  }

  // Drop the trailing dot of a fully-qualified host name ("who.int.").
  const host = parsed.hostname.toLowerCase().replace(/\.$/, '');
  const path = parsed.pathname;

  return allDomains.some((entry) => {
    const rule = String(entry).trim().replace(schemePattern, '');
    if (rule === '') return false;

    const slash = rule.indexOf('/');
    const ruleHost = (slash === -1 ? rule : rule.slice(0, slash)).toLowerCase();
    const rulePath = slash === -1 ? '' : rule.slice(slash).replace(/\/+$/, '');
    if (ruleHost === '') return false;

    const hostMatches = ruleHost.includes('.')
      ? host === ruleHost || host.endsWith(`.${ruleHost}`)
      : host.includes(ruleHost);
    if (!hostMatches) return false;

    return rulePath === '' || path === rulePath || path.startsWith(`${rulePath}/`);
  });
}
2669
2770function extractTitle ( content ) {
@@ -38,12 +81,36 @@ function generateTags(url, content) {
3881 if ( url . includes ( 'pubmed' ) || url . includes ( 'ncbi' ) ) tags . push ( 'pubmed' , 'medical' ) ;
3982 else if ( url . includes ( 'arxiv' ) ) tags . push ( 'arxiv' , 'research' ) ;
4083 else if ( url . includes ( 'who.int' ) ) tags . push ( 'who' , 'global-health' ) ;
41- else if ( url . includes ( 'cancer.org' ) ) tags . push ( 'cancer' , 'oncology' ) ;
42- else if ( url . includes ( 'dermnetnz' ) || url . includes ( 'aad.org' ) ) tags . push ( 'dermatology' ) ;
84+ else if ( url . includes ( 'cancer.org' ) || url . includes ( 'cancer.net' ) || url . includes ( 'nccn.org' ) ) tags . push ( 'cancer' , 'oncology' ) ;
85+ else if ( url . includes ( 'asco.org' ) || url . includes ( 'esmo.org' ) ) tags . push ( 'oncology' , 'clinical' ) ;
86+ else if ( url . includes ( 'mskcc.org' ) || url . includes ( 'mdanderson.org' ) || url . includes ( 'dana-farber.org' ) ) tags . push ( 'oncology' , 'research' ) ;
87+ else if ( url . includes ( 'dermnetnz' ) || url . includes ( 'aad.org' ) || url . includes ( 'jaad.org' ) ) tags . push ( 'dermatology' ) ;
88+ else if ( url . includes ( 'dermcoll' ) || url . includes ( 'bad.org.uk' ) || url . includes ( 'euroderm' ) ) tags . push ( 'dermatology' ) ;
89+ else if ( url . includes ( 'jidonline' ) ) tags . push ( 'dermatology' , 'research' ) ;
4390 else if ( url . includes ( 'melanoma' ) ) tags . push ( 'melanoma' , 'skin-cancer' ) ;
44- else if ( url . includes ( 'acm.org' ) || url . includes ( 'ieee' ) ) tags . push ( 'computer-science' ) ;
91+ else if ( url . includes ( 'clinicaltrials.gov' ) ) tags . push ( 'clinical-trials' , 'medical' ) ;
92+ else if ( url . includes ( 'fda.gov' ) || url . includes ( 'ema.europa.eu' ) ) tags . push ( 'regulatory' , 'medical' ) ;
93+ else if ( url . includes ( 'nice.org.uk' ) || url . includes ( 'cochrane.org' ) ) tags . push ( 'evidence-based' , 'medical' ) ;
94+ else if ( url . includes ( 'hopkinsmedicine' ) || url . includes ( 'stanfordmedicine' ) ) tags . push ( 'medical' , 'academic' ) ;
95+ else if ( url . includes ( 'webmd' ) || url . includes ( 'healthline' ) || url . includes ( 'medscape' ) ) tags . push ( 'medical' , 'clinical' ) ;
96+ else if ( url . includes ( 'uptodate.com' ) ) tags . push ( 'medical' , 'clinical-decision' ) ;
97+ else if ( url . includes ( 'acm.org' ) || url . includes ( 'ieee' ) || url . includes ( 'dl.acm.org' ) ) tags . push ( 'computer-science' ) ;
98+ else if ( url . includes ( 'neurips' ) || url . includes ( 'icml' ) || url . includes ( 'aaai.org' ) ) tags . push ( 'ml' , 'conference' ) ;
99+ else if ( url . includes ( 'cvpr' ) || url . includes ( 'eccv' ) || url . includes ( 'iccv' ) ) tags . push ( 'computer-vision' , 'conference' ) ;
100+ else if ( url . includes ( 'aclanthology' ) ) tags . push ( 'nlp' , 'conference' ) ;
101+ else if ( url . includes ( 'usenix' ) || url . includes ( 'vldb' ) || url . includes ( 'sigmod' ) ) tags . push ( 'systems' , 'conference' ) ;
102+ else if ( url . includes ( 'huggingface' ) || url . includes ( 'pytorch' ) || url . includes ( 'tensorflow' ) ) tags . push ( 'ml' , 'framework' ) ;
103+ else if ( url . includes ( 'deepmind' ) || url . includes ( 'ai.meta' ) || url . includes ( 'research.google' ) ) tags . push ( 'ml' , 'research-lab' ) ;
104+ else if ( url . includes ( 'openai' ) || url . includes ( 'anthropic' ) ) tags . push ( 'ml' , 'research-lab' ) ;
105+ else if ( url . includes ( 'cs.stanford' ) || url . includes ( 'cs.berkeley' ) || url . includes ( 'cs.cmu' ) || url . includes ( 'cs.mit' ) ) tags . push ( 'computer-science' , 'academic' ) ;
106+ else if ( url . includes ( 'openreview' ) || url . includes ( 'paperswithcode' ) ) tags . push ( 'ml' , 'research' ) ;
45107 else if ( url . includes ( 'github' ) || url . includes ( 'stackoverflow' ) ) tags . push ( 'programming' ) ;
46108 else if ( url . includes ( 'nature.com' ) || url . includes ( 'nejm' ) || url . includes ( 'lancet' ) ) tags . push ( 'journal' , 'research' ) ;
109+ else if ( url . includes ( 'jamanetwork' ) || url . includes ( 'annals.org' ) || url . includes ( 'bmj.com' ) ) tags . push ( 'journal' , 'medical' ) ;
110+ else if ( url . includes ( 'frontiersin' ) || url . includes ( 'plos.org' ) || url . includes ( 'biomedcentral' ) ) tags . push ( 'open-access' , 'research' ) ;
111+ else if ( url . includes ( 'cell.com' ) || url . includes ( 'elsevier' ) || url . includes ( 'springer' ) || url . includes ( 'wiley' ) ) tags . push ( 'journal' , 'publisher' ) ;
112+ else if ( url . includes ( 'mdpi.com' ) || url . includes ( 'karger' ) || url . includes ( 'thieme' ) ) tags . push ( 'journal' , 'publisher' ) ;
113+ else if ( url . includes ( 'jmlr.org' ) || url . includes ( 'ijcai.org' ) ) tags . push ( 'ml' , 'journal' ) ;
47114
48115 const lower = content . toLowerCase ( ) ;
49116 if ( lower . includes ( 'melanoma' ) ) tags . push ( 'melanoma' ) ;
0 commit comments