Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 293 additions & 0 deletions 2F6NCTHH5/note.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
{
"paragraphs": [
{
"text": "%md\n\n\nSet spark default conf \n\nspark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.7,postgresql:postgresql:9.1-901-1.jdbc4,org.apache.hadoop:hadoop-aws:2.7.3,com.amazonaws:aws-java-sdk:1.7.4,org.mongodb.spark:mongo-spark-connector_2.11:2.3.0,org.elasticsearch:elasticsearch-hadoop:6.7.0,joda-time:joda-time:2.9.1,org.apache.spark:spark-streaming-twitter_2.10:1.4.1,org.twitter4j:twitter4j-core:3.0.3,org.twitter4j:twitter4j-media-support:3.0.3,org.twitter4j:twitter4j-async:3.0.3,org.twitter4j:twitter4j-examples:3.0.3,org.twitter4j:twitter4j-stream:3.0.3\n\nOR \n\n%dep\n\n/*\n\nBEFORE START\n\nif your are using Hortonworks Sandbox, make sure you added more cores and more memory to YARN\n\nspark streaming requires at least 3 containers to run\n\n\n*/\n\n/* this step must be executed as first command, if you already executed other commands, please click in \"Interpreter\" and restart spark interpreter */\n\nz.reset()\nz.load(\"org.apache.spark:spark-streaming-twitter_2.10:1.4.1\")\nz.load(\"org.twitter4j:twitter4j-core:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-media-support:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-async:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-examples:3.0.3\")\nz.load(\"org.twitter4j:twitter4j-stream:3.0.3\")",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:42:33.049",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "markdown",
"editOnDblClick": true,
"completionKey": "TAB",
"completionSupport": false
},
"editorMode": "ace/mode/markdown",
"editorHide": true,
"tableHide": false
},
"settings": {
"params": {},
"forms": {}
},
"results": {
"code": "SUCCESS",
"msg": [
{
"type": "HTML",
"data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eSet spark default conf \u003c/p\u003e\n\u003cp\u003espark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.7,postgresql:postgresql:9.1-901-1.jdbc4,org.apache.hadoop:hadoop-aws:2.7.3,com.amazonaws:aws-java-sdk:1.7.4,org.mongodb.spark:mongo-spark-connector_2.11:2.3.0,org.elasticsearch:elasticsearch-hadoop:6.7.0,joda-time:joda-time:2.9.1,org.apache.spark:spark-streaming-twitter_2.10:1.4.1,org.twitter4j:twitter4j-core:3.0.3,org.twitter4j:twitter4j-media-support:3.0.3,org.twitter4j:twitter4j-async:3.0.3,org.twitter4j:twitter4j-examples:3.0.3,org.twitter4j:twitter4j-stream:3.0.3\u003c/p\u003e\n\u003cp\u003eOR \u003c/p\u003e\n\u003cp\u003e%dep\u003c/p\u003e\n\u003cp\u003e/*\u003c/p\u003e\n\u003cp\u003eBEFORE START\u003c/p\u003e\n\u003cp\u003eif your are using Hortonworks Sandbox, make sure you added more cores and more memory to YARN\u003c/p\u003e\n\u003cp\u003espark streaming requires at least 3 containers to run\u003c/p\u003e\n\u003cp\u003e*/\u003c/p\u003e\n\u003cp\u003e/* this step must be executed as first command, if you already executed other commands, please click in \u0026ldquo;Interpreter\u0026rdquo; and restart spark interpreter */\u003c/p\u003e\n\u003cp\u003ez.reset()\u003cbr/\u003ez.load(\u0026ldquo;org.apache.spark:spark-streaming-twitter_2.10:1.4.1\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-core:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-media-support:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-async:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-examples:3.0.3\u0026rdquo;)\u003cbr/\u003ez.load(\u0026ldquo;org.twitter4j:twitter4j-stream:3.0.3\u0026rdquo;)\u003c/p\u003e\n\u003c/div\u003e"
}
]
},
"apps": [],
"jobName": "paragraph_1587050985101_-1918385824",
"id": "20200416-152945_597074387",
"dateCreated": "2020-04-16 15:29:45.102",
"dateStarted": "2020-04-16 15:42:33.078",
"dateFinished": "2020-04-16 15:42:33.100",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "%spark\n/*\nUPDATE YOUR TWITTER CREDENTIALS\n*/\n\n\nimport org.apache.spark.streaming._\nimport org.apache.spark.streaming.twitter._\nimport org.apache.spark.storage.StorageLevel\nimport scala.io.Source\nimport scala.collection.mutable.HashMap\nimport java.io.File\nimport org.apache.log4j.Logger\nimport org.apache.log4j.Level\nimport sys.process.stringSeqToProcess\nimport twitter4j.HashtagEntity\n\n/** Configures the Oauth Credentials for accessing Twitter */\ndef configureTwitterCredentials(apiKey: String, apiSecret: String, accessToken: String, accessTokenSecret: String) {\n val configs \u003d new HashMap[String, String] ++\u003d Seq(\n \"apiKey\" -\u003e apiKey, \"apiSecret\" -\u003e apiSecret, \"accessToken\" -\u003e accessToken, \"accessTokenSecret\" -\u003e accessTokenSecret)\n println(\"Configuring Twitter OAuth\")\n configs.foreach{ case(key, value) \u003d\u003e\n if (value.trim.isEmpty) {\n throw new Exception(\"Error setting authentication - value for \" + key + \" not set\")\n }\n val fullKey \u003d \"twitter4j.oauth.\" + key.replace(\"api\", \"consumer\")\n System.setProperty(fullKey, value.trim)\n println(\"\\tProperty \" + fullKey + \" set as [\" + value.trim + \"]\")\n }\n println()\n}\n\n// Configure Twitter credentials\nval apiKey \u003d \"\"\nval apiSecret \u003d \"\"\nval accessToken \u003d \"\"\nval accessTokenSecret \u003d \"\"\nconfigureTwitterCredentials(apiKey, apiSecret, accessToken, accessTokenSecret)\n\nimport org.apache.spark.streaming.twitter._\nval ssc \u003d new StreamingContext(sc, Seconds(2))\nval tweets \u003d TwitterUtils.createStream(ssc, None, Array(\"#hadoop\", \"#bigdata\", \"#spark\", \"#hortonworks\", \"#HDP\"))\n//tweets.saveAsObjectFiles(\"hdfs://sandbox.hortonworks.com:8020/test/twitter-spark/twitter_\", \".txt\")\n\nval twt \u003d tweets.window(Seconds(600))\n\ncase class Tweet(createdAt:Long, text:String, screenName:String)\ntwt.map(status\u003d\u003e\n Tweet(status.getCreatedAt().getTime()/1000, status.getText(), status.getUser().getScreenName())\n).foreachRDD(rdd\u003d\u003e\n rdd.toDF().registerTempTable(\"tweets\")\n)\n\n//twt.print\n\nssc.start()\n\n",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:42:33.174",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "scala",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"results": {
"code": "ERROR",
"msg": [
{
"type": "TEXT",
"data": "\u003cconsole\u003e:19: error: object twitter is not a member of package org.apache.spark.streaming\n import org.apache.spark.streaming.twitter._\n ^\n\u003cconsole\u003e:52: error: object twitter is not a member of package org.apache.spark.streaming\n import org.apache.spark.streaming.twitter._\n ^\n\u003cconsole\u003e:27: error: not found: value twitter4j\n import twitter4j.HashtagEntity\n ^\n\u003cconsole\u003e:54: error: not found: value TwitterUtils\n val tweets \u003d TwitterUtils.createStream(ssc, None, Array(\"#hadoop\", \"#bigdata\", \"#spark\", \"#hortonworks\", \"#HDP\"))\n ^\n"
}
]
},
"apps": [],
"jobName": "paragraph_1587051544840_-284142077",
"id": "20200416-153904_1319192733",
"dateCreated": "2020-04-16 15:39:04.840",
"dateStarted": "2020-04-16 15:42:33.198",
"dateFinished": "2020-04-16 15:42:33.592",
"status": "ERROR",
"progressUpdateIntervalMs": 500
},
{
"text": "%sql\n-- checking window contents\n\nselect from_unixtime(createdAt), count(1) from tweets group by createdAt order by createdAt\n",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:40:30.550",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "sql",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/sql"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051613440_1804273809",
"id": "20200416-154013_1555972031",
"dateCreated": "2020-04-16 15:40:13.440",
"status": "READY",
"progressUpdateIntervalMs": 500
},
{
"text": "%spark\n/* declaring a function in Scala */\n\ndef sentiment(s:String) : String \u003d {\n val positive \u003d Array(\"like\", \"love\", \"good\", \"great\", \"happy\", \"cool\", \"the\", \"one\", \"that\")\n val negative \u003d Array(\"hate\", \"bad\", \"stupid\", \"is\")\n\n var st \u003d 0;\n\n val words \u003d s.split(\" \") \n positive.foreach(p \u003d\u003e\n words.foreach(w \u003d\u003e\n if(p\u003d\u003dw) st \u003d st+1\n )\n )\n\n negative.foreach(p\u003d\u003e\n words.foreach(w\u003d\u003e\n if(p\u003d\u003dw) st \u003d st-1\n )\n )\n if(st\u003e0)\n \"positivie\"\n else if(st\u003c0)\n \"negative\"\n else\n \"neutral\"\n}\n\nsqlc.udf.register(\"sentiment\", sentiment _)",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:41:11.990",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "scala",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051655860_1808435087",
"id": "20200416-154055_475437300",
"dateCreated": "2020-04-16 15:40:55.860",
"status": "READY",
"progressUpdateIntervalMs": 500
},
{
"text": "%pyspark\n\n#declaring a function in Python\n\nimport re\n\ndef wordcount(a):\n return len(re.split(\"\\W+\",a))\n \nsqlContext.registerFunction(\"wordcount\", wordcount)\n",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:41:25.295",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/python"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051679740_-838401990",
"id": "20200416-154119_2099153382",
"dateCreated": "2020-04-16 15:41:19.740",
"status": "READY",
"progressUpdateIntervalMs": 500
},
{
"text": "%sql\n\n--using SQL to mix window data, with scala function and python function\n\nselect from_unixtime(createdAt) as created, screenName, sentiment(text) as sentiment, wordcount(text) as wordcount, text from tweets\n",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:41:36.240",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "sql",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/sql"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051685360_-1734620202",
"id": "20200416-154125_1588696872",
"dateCreated": "2020-04-16 15:41:25.360",
"status": "READY",
"progressUpdateIntervalMs": 500
},
{
"text": "%sql\n\n--ploting sentiment\n\nselect sentiment(text) as sentiment, count(1) from tweets group by sentiment(text)",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:41:52.595",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "scala",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051703200_-1582474404",
"id": "20200416-154143_91820265",
"dateCreated": "2020-04-16 15:41:43.200",
"status": "READY",
"progressUpdateIntervalMs": 500
},
{
"text": "%sql\n\n--show most common words for positive and negatives tweets\n\nselect word, sentiment, count(1) as cnt\nfrom \n(\n select from_unixtime(createdAt) as created, screenName, sentiment(text) as sentiment, wordcount(text) as wordcount, text from tweets\n) sub1\nlateral view explode(split(text, \u0027 \u0027)) t as word \nwhere sentiment \u003c\u003e \u0027neutral\u0027\ngroup by word, sentiment\n--having count(1) \u003e 1\norder by cnt desc\n\n",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:42:05.860",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "scala",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051721600_-463177037",
"id": "20200416-154201_814758669",
"dateCreated": "2020-04-16 15:42:01.600",
"status": "READY",
"progressUpdateIntervalMs": 500
},
{
"text": "%sql\n\nselect minute, sentiment, count(1) as cnt from\n(\n select substr(from_unixtime(createdAt), 0, 16) as minute, screenName, sentiment(text) as sentiment, wordcount(text) as wordcount, text from tweets\n) sub1\ngroup by minute, sentiment\norder by minute\n",
"user": "anonymous",
"dateUpdated": "2020-04-16 15:42:17.730",
"config": {
"colWidth": 12.0,
"fontSize": 9.0,
"enabled": true,
"results": {},
"editorSetting": {
"language": "scala",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1587051732930_373519832",
"id": "20200416-154212_1403931045",
"dateCreated": "2020-04-16 15:42:12.930",
"status": "READY",
"progressUpdateIntervalMs": 500
}
],
"name": "demo-notebooks/Twitter",
"id": "2F6NCTHH5",
"noteParams": {},
"noteForms": {},
"angularObjects": {
"md:shared_process": [],
"sh:shared_process": [],
"spark:shared_process": []
},
"config": {
"isZeppelinNotebookCronEnable": false
},
"info": {}
}
Loading