diff --git a/parser/cometCalendarParser.go b/parser/cometCalendarParser.go
index 4575b6c..286ea67 100644
--- a/parser/cometCalendarParser.go
+++ b/parser/cometCalendarParser.go
@@ -12,7 +12,6 @@ import (
 	"regexp"
 	"slices"
 	"strings"
-	"time"
 
 	"github.com/UTDNebula/api-tools/scrapers"
 	"github.com/UTDNebula/api-tools/utils"
@@ -137,6 +136,7 @@ var DefaultValid []string = []string{
 	"RCW",
 }
 
+// ParseCometCalendar reformats the comet calendar data into JSON that can be uploaded to Mongo
 func ParseCometCalendar(inDir string, outDir string) {
 
 	calendarFile, err := os.ReadFile(inDir + "/cometCalendarScraped.json")
@@ -251,21 +251,18 @@ func ParseCometCalendar(inDir string, outDir string) {
 	utils.WriteJSON(fmt.Sprintf("%s/cometCalendar.json", outDir), result)
 }
 
-// getAbbreviations dynamically retrieves the all of the locations abbreviations
+// getLocationAbbreviations dynamically retrieves all of the location abbreviations
 func getLocationAbbreviations(inDir string) (map[string]string, []string, error) {
 	// Get the locations from the map scraper
 	var mapFile []byte
-
 	mapFile, err := os.ReadFile(inDir + "/mapLocations.json")
 	if err != nil {
 		if os.IsNotExist(err) {
-			// Scrape the data if the it doesn't exist yet and then get the map file
+			// Force a scrape of the locations if the file doesn't exist, then read the map file again
 			scrapers.ScrapeMapLocations(inDir)
-			time.Sleep(2 * time.Second)
 			ParseMapLocations(inDir, inDir)
-			time.Sleep(2 * time.Second)
 
-			// If fail to get the locations again, it's not because location is unscraped
+			// If reading fails again, it's not because the locations are unscraped
 			mapFile, err = os.ReadFile(inDir + "/mapLocations.json")
 			if err != nil {
 				return nil, nil, err
@@ -274,7 +271,6 @@ func getLocationAbbreviations(inDir string) (map[string]string, []string, error)
 			return nil, nil, err
 		}
 	}
-
 	var locations []schema.MapBuilding
 	if err = json.Unmarshal(mapFile, &locations); err != nil {
 		return nil, nil, err
@@ -288,7 +284,7 @@ func getLocationAbbreviations(inDir string) (map[string]string, []string, error)
 		// Trim the following acronym in the name
 		trimmedName := strings.Split(*location.Name, " (")[0]
 		// Fallback on the locations that have no acronyms
-		abbreviation := ""
+		var abbreviation string
 		if location.Acronym != nil {
 			abbreviation = *location.Acronym
 		}
diff --git a/parser/gradeLoader.go b/parser/gradeLoader.go
index 57c7d7a..cff86a3 100644
--- a/parser/gradeLoader.go
+++ b/parser/gradeLoader.go
@@ -100,7 +100,7 @@ func csvToMap(csvFile *os.File, logFile *os.File) map[string][]int {
 	// optional columns
 	for _, name := range []string{"W", "P", "CR", "NC", "I"} {
 		if _, ok := indexMap[name]; !ok {
-			logFile.WriteString(fmt.Sprintf("could not find %s column\n", name))
+			fmt.Fprintf(logFile, "could not find %s column\n", name)
 		}
 	}
diff --git a/scrapers/adacemicCalendars.go b/scrapers/adacemicCalendars.go
index b641619..e321a07 100644
--- a/scrapers/adacemicCalendars.go
+++ b/scrapers/adacemicCalendars.go
@@ -41,16 +41,34 @@ func ScrapeAcademicCalendars(outDir string) {
 	}
 
 	// Go to listings page
-	chromedp.RunResponse(chromedpCtx,
+	_, err = chromedp.RunResponse(chromedpCtx,
 		chromedp.Navigate(`https://www.utdallas.edu/academics/calendar/`),
 	)
+	if err != nil {
+		panic(err)
+	}
+
+	// Selectors for scraping the calendar nodes
+	currentSel := `a.wp-block-button__link`
+	futureSel := `//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`
+	pastSel := `//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`
 
 	// Extract data from links
 	// Current
 	academicCalendars := []AcademicCalendar{{"", "", "current"}}
-	chromedp.Run(chromedpCtx, chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery))
+	err = chromedp.Run(chromedpCtx,
+		chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery),
+	)
+	if err != nil {
+		panic(err)
+	}
 
 	var currentNode []*cdp.Node
-	chromedp.Run(chromedpCtx, chromedp.Nodes("a.wp-block-button__link", &currentNode, chromedp.ByQuery))
+	err = chromedp.Run(chromedpCtx,
+		chromedp.Nodes(currentSel, &currentNode, chromedp.ByQuery),
+	)
+	if err != nil {
+		panic(err)
+	}
 	for i := 0; i < len(currentNode[0].Attributes); i += 2 {
 		if currentNode[0].Attributes[i] == "href" {
 			academicCalendars[0].Href = currentNode[0].Attributes[i+1]
@@ -59,29 +77,42 @@ func ScrapeAcademicCalendars(outDir string) {
 
 	// Future list
 	var futureNodes []*cdp.Node
-	chromedp.Run(chromedpCtx,
-		chromedp.Nodes(`//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`, &futureNodes, chromedp.BySearch),
+	err = chromedp.Run(chromedpCtx,
+		chromedp.Nodes(futureSel, &futureNodes, chromedp.BySearch),
 	)
-	academicCalendars = append(academicCalendars, extractTextAndHref(futureNodes, "future", chromedpCtx)...)
+	if err != nil {
+		panic(err)
+	}
+	newCalendars := extractTextAndHref(futureNodes, "future", chromedpCtx)
+	academicCalendars = append(academicCalendars, newCalendars...)
 
 	// Past list
 	var pastNodes []*cdp.Node
-	chromedp.Run(chromedpCtx,
-		chromedp.Nodes(`//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`, &pastNodes, chromedp.BySearch),
+	err = chromedp.Run(chromedpCtx,
+		chromedp.Nodes(pastSel, &pastNodes, chromedp.BySearch),
 	)
-	academicCalendars = append(academicCalendars, extractTextAndHref(pastNodes, "past", chromedpCtx)...)
+	if err != nil {
+		panic(err)
+	}
+	newCalendars = extractTextAndHref(pastNodes, "past", chromedpCtx)
+	academicCalendars = append(academicCalendars, newCalendars...)
 
 	// Don't need ChromeDP anymore
 	cancel()
 
 	// Download all PDFs
 	for _, academicCalendar := range academicCalendars {
-		downloadPdfFromBox(academicCalendar.Href, academicCalendar.Time+"-"+academicCalendar.Title, outSubDir)
+		downloadPdfFromBox(
+			academicCalendar.Href,
+			academicCalendar.Time+"-"+academicCalendar.Title,
+			outSubDir,
+		)
 	}
 }
 
 func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Context) []AcademicCalendar {
 	output := []AcademicCalendar{}
+	var err error
 
 	// Extract href and text
 	for _, n := range nodes {
@@ -93,8 +124,12 @@ func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Cont
 			}
 		}
 		// Get inner text
-		chromedp.Run(chromedpCtx, chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery))
-
+		err = chromedp.Run(chromedpCtx,
+			chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery),
+		)
+		if err != nil {
+			panic(err)
+		}
 		output = append(output, AcademicCalendar{text, href, time})
 	}
 
diff --git a/scrapers/cometCalendar.go b/scrapers/cometCalendar.go
index 3a4f613..e2ac39d 100644
--- a/scrapers/cometCalendar.go
+++ b/scrapers/cometCalendar.go
@@ -19,7 +19,7 @@ import (
 	"go.mongodb.org/mongo-driver/bson/primitive"
 )
 
-const CAL_URL string = "https://calendar.utdallas.edu/api/2/events"
+const COMET_CALENDAR_URL string = "https://calendar.utdallas.edu/api/2/events"
 
 // RawEvent mirrors the nested event payload returned by the calendar API.
 type RawEvent struct {
@@ -66,16 +66,16 @@ func ScrapeCometCalendar(outDir string) {
 
 		calendarEvents = append(calendarEvents, schema.Event{
 			Id:             primitive.NewObjectID(),
-			Summary:        convert[string](rawEvent.Event["title"]),
+			Summary:        to[string](rawEvent.Event["title"]),
 			Location:       getEventLocation(rawEvent),
 			StartTime:      startTime,
 			EndTime:        endTime,
-			Description:    convert[string](rawEvent.Event["description_text"]),
+			Description:    to[string](rawEvent.Event["description_text"]),
 			EventType:      eventTypes,
 			TargetAudience: targetAudiences,
 			Topic:          eventTopics,
 			EventTags:      tags,
-			EventWebsite:   convert[string](rawEvent.Event["url"]),
+			EventWebsite:   to[string](rawEvent.Event["url"]),
 			Department:     departments,
 			ContactName:    contactInfo[0],
 			ContactEmail:   contactInfo[1],
@@ -94,10 +94,10 @@ func ScrapeCometCalendar(outDir string) {
 	log.Printf("Finished scraping %d events successfully!\n\n", len(calendarEvents))
 }
 
-// scrapeAndUnmarshal fetches a calendar page and decodes it into data.
+// callAndUnmarshal fetches a calendar page and decodes it into data.
 func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error {
 	// Call API to get the byte data
-	calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", CAL_URL, page)
+	calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", COMET_CALENDAR_URL, page)
 	request, err := http.NewRequest("GET", calendarUrl, nil)
 	if err != nil {
 		return err
@@ -130,24 +130,20 @@ func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse)
 
 // getTime parses the start and end time of the event
 func getTime(event RawEvent) (time.Time, time.Time) {
-	instance := convert[map[string]any](
-		convert[map[string]any](
-			convert[[]any](event.Event["event_instances"])[0])["event_instance"])
+	instance := to[map[string]any](to[map[string]any](to[[]any](event.Event["event_instances"])[0])["event_instance"])
 
 	// Converts RFC3339 timestamp string to time.Time
-	startTime, err := time.Parse(time.RFC3339, convert[string](instance["start"]))
+	startTime, err := time.Parse(time.RFC3339, to[string](instance["start"]))
 	if err != nil {
 		panic(err)
 	}
 
-	var endTime time.Time
-	if convert[string](instance["end"]) != "" {
-		endTime, err = time.Parse(time.RFC3339, convert[string](instance["end"]))
+	endTime := startTime
+	if to[string](instance["end"]) != "" {
+		endTime, err = time.Parse(time.RFC3339, to[string](instance["end"]))
 		if err != nil {
 			panic(err)
 		}
-	} else {
-		endTime = startTime
 	}
 
 	return startTime, endTime
@@ -155,10 +151,9 @@ func getTime(event RawEvent) (time.Time, time.Time) {
 
 // getEventLocation parses the location of the event
 func getEventLocation(event RawEvent) string {
-	building := convert[string](event.Event["location_name"])
-	room := convert[string](event.Event["room_number"])
+	building := to[string](event.Event["location_name"])
+	room := to[string](event.Event["room_number"])
 	location := strings.Trim(fmt.Sprintf("%s, %s", building, room), " ,")
-
 	return location
 }
 
@@ -168,21 +163,21 @@ func getFilters(event RawEvent) ([]string, []string, []string) {
 	audiences := []string{}
 	topics := []string{}
 
-	filters := convert[map[string]any](event.Event["filters"])
+	filters := to[map[string]any](event.Event["filters"])
 
-	rawTypes := convert[[]any](filters["event_types"])
+	rawTypes := to[[]any](filters["event_types"])
 	for _, rawType := range rawTypes {
-		types = append(types, convert[string](convert[map[string]any](rawType)["name"]))
+		types = append(types, to[string](to[map[string]any](rawType)["name"]))
 	}
 
-	rawAudiences := convert[[]any](filters["event_target_audience"])
+	rawAudiences := to[[]any](filters["event_target_audience"])
 	for _, audience := range rawAudiences {
-		audiences = append(audiences, convert[string](convert[map[string]any](audience)["name"]))
+		audiences = append(audiences, to[string](to[map[string]any](audience)["name"]))
 	}
 
-	rawTopics := convert[[]any](filters["event_topic"])
+	rawTopics := to[[]any](filters["event_topic"])
 	for _, topic := range rawTopics {
-		topics = append(topics, convert[string](convert[map[string]any](topic)["name"]))
+		topics = append(topics, to[string](to[map[string]any](topic)["name"]))
 	}
 
 	return types, audiences, topics
@@ -193,14 +188,14 @@ func getDepartmentsAndTags(event RawEvent) ([]string, []string) {
 	departments := []string{}
 	tags := []string{}
 
-	rawTags := convert[[]any](event.Event["tags"])
+	rawTags := to[[]any](event.Event["tags"])
 	for _, tag := range rawTags {
-		tags = append(tags, convert[string](tag))
+		tags = append(tags, to[string](tag))
 	}
 
-	rawDeparments := convert[[]any](event.Event["departments"])
+	rawDeparments := to[[]any](event.Event["departments"])
 	for _, deparment := range rawDeparments {
-		departments = append(departments, convert[string](convert[map[string]any](deparment)["name"]))
+		departments = append(departments, to[string](to[map[string]any](deparment)["name"]))
 	}
 
 	return departments, tags
@@ -211,20 +206,20 @@ func getContactInfo(event RawEvent) [3]string {
 	// Note that some events won't have contact phone number
 	contactInfo := [3]string{}
 
-	rawContactInfo := convert[map[string]any](event.Event["custom_fields"])
+	rawContactInfo := to[map[string]any](event.Event["custom_fields"])
 	for i, infoField := range []string{
 		"contact_information_name",
 		"contact_information_email",
 		"contact_information_phone",
 	} {
-		contactInfo[i] = convert[string](rawContactInfo[infoField])
+		contactInfo[i] = to[string](rawContactInfo[infoField])
 	}
 
 	return contactInfo
 }
 
-// convert() attempts to convert data into types for this scraper
-func convert[T []any | map[string]any | string](data any) T {
+// to attempts to convert data into the given type for this scraper, returning the zero value on failure
+func to[T []any | map[string]any | string](data any) T {
 	if newTypedData, ok := data.(T); ok {
 		return newTypedData
 	}
diff --git a/uploader/pipelines/trends.go b/uploader/pipelines/trends.go
new file mode 100644
index 0000000..caa033c
--- /dev/null
+++ b/uploader/pipelines/trends.go
@@ -0,0 +1,149 @@
+package pipelines
+
+import (
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/mongo"
+)
+
+// TrendsCourseSectionsPipeline links course documents to their section records for trends-specific aggregation.
+var TrendsCourseSectionsPipeline = mongo.Pipeline{
+	bson.D{
+		{Key: "$lookup",
+			Value: bson.D{
+				{Key: "from", Value: "sections"},
+				{Key: "localField", Value: "sections"},
+				{Key: "foreignField", Value: "_id"},
+				{Key: "as", Value: "sections"},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$project",
+			Value: bson.D{
+				{Key: "subject_prefix", Value: 1},
+				{Key: "course_number", Value: 1},
+				{Key: "sections", Value: 1},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$unwind",
+			Value: bson.D{
+				{Key: "path", Value: "$sections"},
+				{Key: "preserveNullAndEmptyArrays", Value: false},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$group",
+			Value: bson.D{
+				{Key: "_id",
+					Value: bson.D{
+						{Key: "$concat",
+							Value: bson.A{
+								"$subject_prefix",
+								"$course_number",
+							},
+						},
+					},
+				},
+				{Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}},
+			},
+		},
+	},
+}
+
+// TrendsProfSectionsPipeline denormalizes professor records with their taught sections for trends-specific aggregation.
+var TrendsProfSectionsPipeline = mongo.Pipeline{
+	bson.D{
+		{Key: "$lookup",
+			Value: bson.D{
+				{Key: "from", Value: "sections"},
+				{Key: "localField", Value: "sections"},
+				{Key: "foreignField", Value: "_id"},
+				{Key: "as", Value: "sections"},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$project",
+			Value: bson.D{
+				{Key: "first_name", Value: 1},
+				{Key: "last_name", Value: 1},
+				{Key: "sections", Value: 1},
+			},
+		},
+	},
+}
+
+// TrendsCourseProfSectionsPipeline links each professor and course combination to its sections for trends-specific aggregation.
+var TrendsCourseProfSectionsPipeline = mongo.Pipeline{
+	bson.D{
+		{Key: "$lookup",
+			Value: bson.D{
+				{Key: "from", Value: "sections"},
+				{Key: "localField", Value: "sections"},
+				{Key: "foreignField", Value: "_id"},
+				{Key: "as", Value: "sections"},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$project",
+			Value: bson.D{
+				{Key: "subject_prefix", Value: 1},
+				{Key: "course_number", Value: 1},
+				{Key: "sections", Value: 1},
+			},
+		},
+	},
+
+	bson.D{
+		{Key: "$unwind",
+			Value: bson.D{
+				{Key: "path", Value: "$sections"},
+				{Key: "preserveNullAndEmptyArrays", Value: false},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$lookup",
+			Value: bson.D{
+				{Key: "from", Value: "professors"},
+				{Key: "localField", Value: "sections.professors"},
+				{Key: "foreignField", Value: "_id"},
+				{Key: "as", Value: "professors"},
+			},
+		},
+	},
+	bson.D{
+		{Key: "$unwind",
+			Value: bson.D{
+				{Key: "path", Value: "$professors"},
+				{Key: "preserveNullAndEmptyArrays", Value: false},
+			},
+		},
+	},
+
+	bson.D{
+		{Key: "$group",
+			Value: bson.D{
+				{Key: "_id",
+					Value: bson.D{
+						{Key: "$concat",
+							Value: bson.A{
+								"$subject_prefix",
+								"$course_number",
+								" ",
+								"$professors.first_name",
+								" ",
+								"$professors.last_name",
+							},
+						},
+					},
+				},
+				{Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}},
+			},
+		},
+	},
+}
diff --git a/uploader/pipelines/trends_course_and_prof_section.go b/uploader/pipelines/trends_course_and_prof_section.go
deleted file mode 100644
index fc83589..0000000
--- a/uploader/pipelines/trends_course_and_prof_section.go
+++ /dev/null
@@ -1,77 +0,0 @@
-package pipelines
-
-import (
-	"go.mongodb.org/mongo-driver/bson"
-	"go.mongodb.org/mongo-driver/mongo"
-)
-
-var TrendsCourseProfSectionsPipeline = mongo.Pipeline{
-	bson.D{
-		{Key: "$lookup",
-			Value: bson.D{
-				{Key: "from", Value: "sections"},
-				{Key: "localField", Value: "sections"},
-				{Key: "foreignField", Value: "_id"},
-				{Key: "as", Value: "sections"},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$project",
-			Value: bson.D{
-				{Key: "subject_prefix", Value: 1},
-				{Key: "course_number", Value: 1},
-				{Key: "sections", Value: 1},
-			},
-		},
-	},
-
-	bson.D{
-		{Key: "$unwind",
-			Value: bson.D{
-				{Key: "path", Value: "$sections"},
-				{Key: "preserveNullAndEmptyArrays", Value: false},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$lookup",
-			Value: bson.D{
-				{Key: "from", Value: "professors"},
-				{Key: "localField", Value: "sections.professors"},
-				{Key: "foreignField", Value: "_id"},
-				{Key: "as", Value: "professors"},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$unwind",
-			Value: bson.D{
-				{Key: "path", Value: "$professors"},
-				{Key: "preserveNullAndEmptyArrays", Value: false},
-			},
-		},
-	},
-
-	bson.D{
-		{Key: "$group",
-			Value: bson.D{
-				{Key: "_id",
-					Value: bson.D{
-						{Key: "$concat",
-							Value: bson.A{
-								"$subject_prefix",
-								"$course_number",
-								" ",
-								"$professors.first_name",
-								" ",
-								"$professors.last_name",
-							},
-						},
-					},
-				},
-				{Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}},
-			},
-		},
-	},
-}
diff --git a/uploader/pipelines/trends_course_sections.go b/uploader/pipelines/trends_course_sections.go
deleted file mode 100644
index 87fce4d..0000000
--- a/uploader/pipelines/trends_course_sections.go
+++ /dev/null
@@ -1,54 +0,0 @@
-package pipelines
-
-import (
-	"go.mongodb.org/mongo-driver/bson"
-	"go.mongodb.org/mongo-driver/mongo"
-)
-
-// TrendsCourseSectionsPipeline links course documents to their section records for trend reporting.
-var TrendsCourseSectionsPipeline = mongo.Pipeline{
-	bson.D{
-		{Key: "$lookup",
-			Value: bson.D{
-				{Key: "from", Value: "sections"},
-				{Key: "localField", Value: "sections"},
-				{Key: "foreignField", Value: "_id"},
-				{Key: "as", Value: "sections"},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$project",
-			Value: bson.D{
-				{Key: "subject_prefix", Value: 1},
-				{Key: "course_number", Value: 1},
-				{Key: "sections", Value: 1},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$unwind",
-			Value: bson.D{
-				{Key: "path", Value: "$sections"},
-				{Key: "preserveNullAndEmptyArrays", Value: false},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$group",
-			Value: bson.D{
-				{Key: "_id",
-					Value: bson.D{
-						{Key: "$concat",
-							Value: bson.A{
-								"$subject_prefix",
-								"$course_number",
-							},
-						},
-					},
-				},
-				{Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}},
-			},
-		},
-	},
-}
diff --git a/uploader/pipelines/trends_prof_sections.go b/uploader/pipelines/trends_prof_sections.go
deleted file mode 100644
index 2961c97..0000000
--- a/uploader/pipelines/trends_prof_sections.go
+++ /dev/null
@@ -1,29 +0,0 @@
-package pipelines
-
-import (
-	"go.mongodb.org/mongo-driver/bson"
-	"go.mongodb.org/mongo-driver/mongo"
-)
-
-// TrendsProfSectionsPipeline denormalizes professor records with their taught sections for analytics.
-var TrendsProfSectionsPipeline = mongo.Pipeline{
-	bson.D{
-		{Key: "$lookup",
-			Value: bson.D{
-				{Key: "from", Value: "sections"},
-				{Key: "localField", Value: "sections"},
-				{Key: "foreignField", Value: "_id"},
-				{Key: "as", Value: "sections"},
-			},
-		},
-	},
-	bson.D{
-		{Key: "$project",
-			Value: bson.D{
-				{Key: "first_name", Value: 1},
-				{Key: "last_name", Value: 1},
-				{Key: "sections", Value: 1},
-			},
-		},
-	},
-}
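For reviewers who want to sanity-check the consolidated pipelines, here is a minimal sketch of how one of them might be run through the standard mongo-driver Aggregate API. The database name ("combinedDB"), the "courses" collection, the connection URI, and the import path for the pipelines package are assumptions for illustration only; they are not taken from this diff.

```go
package main

import (
	"context"
	"log"

	"github.com/UTDNebula/api-tools/uploader/pipelines" // assumed import path
	"go.mongodb.org/mongo-driver/bson"
	"go.mongodb.org/mongo-driver/mongo"
	"go.mongodb.org/mongo-driver/mongo/options"
)

func main() {
	ctx := context.Background()

	// Assumed local MongoDB instance and database/collection names.
	client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://localhost:27017"))
	if err != nil {
		log.Fatal(err)
	}
	defer client.Disconnect(ctx)

	// Run the course -> sections aggregation; each result document has an _id of
	// "<subject_prefix><course_number>" and a deduplicated "sections" array.
	cursor, err := client.Database("combinedDB").Collection("courses").
		Aggregate(ctx, pipelines.TrendsCourseSectionsPipeline)
	if err != nil {
		log.Fatal(err)
	}

	var groups []bson.M
	if err := cursor.All(ctx, &groups); err != nil {
		log.Fatal(err)
	}
	log.Printf("aggregated %d course groups", len(groups))
}
```

The same pattern should apply to TrendsProfSectionsPipeline (run against the professors collection) and TrendsCourseProfSectionsPipeline, whose $group stage keys results by professor and course together.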