Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Problem_1/Output_TotalWordCount_image.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
51 changes: 51 additions & 0 deletions Problem_1/Problem1_testcase.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Test case 1. If words.txt file is not present

Output: words.txt file not found.


Test case 2. If urls.txt file is not present

Output: urls.txt file not found


Test case 3. Both words.txt and urls.txt files are present

Input: words.txt

to
the
and
a
PAY
US
BLOG
OFBiz
Tutorials

Input: urls.txt

https://www.hotwaxsystems.com/
https://www.geeksforgeeks.org/


Output:

=================================================
https://www.hotwaxsystems.com/
and->21
to->19
OFBiz->15
https://www.geeksforgeeks.org/
and->15
to->7
a->1
=================================================
and->36
to->26
OFBiz->15
a->9
US->6
BLOG->3
Tutorials->3
PAY->2
the ->0
89 changes: 89 additions & 0 deletions Problem_1/WebScrapper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// importing required dependencies
import axios from 'axios';
import cheerio from 'cheerio';
import fs from 'fs';

// Number of highest-count words printed for each individual URL.
const TOP_WORDS_PER_URL = 3;

/**
 * Reads a UTF-8 text file and returns its lines, trimmed, with blank lines dropped.
 * Both LF and CRLF line endings are accepted.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string[]>} The non-empty, trimmed lines of the file.
 */
const readLines = async (filePath) => {
  const content = await fs.promises.readFile(filePath, 'utf8');
  return content
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== '');
};

/**
 * Reads a required input file, reporting any failure on stderr instead of throwing.
 *
 * @param {string} fileName - Name of the input file, resolved against the working directory.
 * @returns {Promise<string[] | null>} The file's lines, or null when it could not be read.
 */
const readInputFile = async (fileName) => {
  try {
    return await readLines(fileName);
  } catch (err) {
    if (err.code === 'ENOENT') {
      console.error(`${fileName} file not found`);
    } else {
      console.error(`Unable to read ${fileName}: ${err.message}`);
    }
    return null;
  }
};

/**
 * Builds a counter map with every given word initialised to zero.
 *
 * @param {Iterable<string>} words - Words to track.
 * @returns {Map<string, number>} Map from word to a zero count.
 */
const createCounter = (words) => new Map([...words].map((entry) => [entry, 0]));

/**
 * Returns the [word, count] pairs of a counter ordered by count, highest first.
 * The sort is stable, so words with equal counts keep their insertion order.
 *
 * @param {Map<string, number>} counter - Word counter.
 * @returns {Array<[string, number]>} Sorted entries.
 */
const sortByCountDescending = (counter) =>
  [...counter.entries()].sort((a, b) => b[1] - a[1]);

/**
 * Prints each [word, count] pair as "word->count".
 *
 * @param {Array<[string, number]>} entries - Pairs to print.
 */
const printCounts = (entries) => {
  for (const [term, total] of entries) {
    console.log(`${term}->${total}`);
  }
};

/**
 * Counts how often each word listed in words.txt occurs in the body text of every
 * page listed in urls.txt.
 *
 * For each URL the three most frequent tracked words are printed; after all URLs
 * have been processed, the totals across every page are printed in descending order.
 * Matching is case-sensitive and on whole whitespace-separated tokens.
 *
 * A missing input file is reported on stderr and ends the run. A URL that cannot be
 * fetched is reported on stderr and skipped so the remaining URLs are still processed.
 *
 * @returns {Promise<void>} Resolves once all output has been printed.
 */
const scrapData = async () => {
  const urlLines = await readInputFile('urls.txt');
  if (urlLines === null) {
    return;
  }
  const wordLines = await readInputFile('words.txt');
  if (wordLines === null) {
    return;
  }

  // A Set drops duplicate URLs so no page is fetched (and counted) twice.
  const urls = new Set(urlLines);
  const totalCounts = createCounter(wordLines);

  console.log("=================================================");
  for (const url of urls) {
    console.log(url);

    let html;
    try {
      ({ data: html } = await axios.get(url));
    } catch (err) {
      console.error(`Failed to fetch ${url}: ${err.message}`);
      continue;
    }

    // Parse the page and drop script content so only visible text is counted.
    const $ = cheerio.load(html);
    $("script").remove();
    $("noscript").remove();

    const pageCounts = createCounter(totalCounts.keys());
    for (const token of $("body").text().split(/\s+/)) {
      if (totalCounts.has(token)) {
        totalCounts.set(token, totalCounts.get(token) + 1);
        pageCounts.set(token, pageCounts.get(token) + 1);
      }
    }

    printCounts(sortByCountDescending(pageCounts).slice(0, TOP_WORDS_PER_URL));
  }
  console.log("=================================================");

  printCounts(sortByCountDescending(totalCounts));
};
// Run the scraper. The returned promise is handled explicitly so an unexpected
// failure is reported and reflected in the exit code instead of being left floating.
scrapData().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Binary file added Problem_1/output_image.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
17 changes: 17 additions & 0 deletions Problem_1/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "kunal-module-4",
"version": "1.0.0",
"type": "module",
"main": "WebScrapper.js",
"scripts": {
"start": "node WebScrapper.js"
},
"keywords": [],
"author": "",
"license": "ISC",
"description": "",
"dependencies": {
"axios": "^1.3.4",
"cheerio": "^1.0.0-rc.12"
}
}
2 changes: 2 additions & 0 deletions Problem_1/urls.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
https://www.hotwaxsystems.com/
https://www.geeksforgeeks.org/
9 changes: 9 additions & 0 deletions Problem_1/words.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
to
the
and
a
PAY
US
BLOG
OFBiz
Tutorials
Loading