Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ func main() {
mapFlag := flag.Bool("map", false, "Alongside -scrape, -parse, or -upload, signifies that the UTD map should be scraped/parsed/uploaded.")
// Flag for academic calendar scraping
academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.")
// Flag for degree scraping and parsing
degrees := flag.Bool("degrees", false, "Alongside -scrape or -parse, signifies that the degrees should be scraped/parsed.")

// Flags for parsing
parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
Expand Down Expand Up @@ -118,6 +120,8 @@ func main() {
scrapers.ScrapeMapLocations(*outDir)
case *academicCalendars:
scrapers.ScrapeAcademicCalendars(*outDir)
case *degrees:
scrapers.ScrapeDegrees(*outDir)
default:
log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
}
Expand All @@ -135,6 +139,8 @@ func main() {
parser.ParseAcademicCalendars(*inDir, *outDir)
case *scrapeDiscounts:
parser.ParseDiscounts(*inDir, *outDir)
case *degrees:
parser.ParseDegrees(*inDir, *outDir)
default:
parser.Parse(*inDir, *outDir, *csvDir, *skipValidation)
}
Expand Down
155 changes: 155 additions & 0 deletions parser/degreeParser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package parser

import (
"encoding/json"
"fmt"
"log"
"os"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/UTDNebula/api-tools/utils"
)

// AcademicProgram is one program entry extracted from the UTD degrees page
// and written to degrees.json.
// NOTE(review): the Title field serializes under the key "name" in both BSON
// and JSON — confirm downstream consumers expect "name", not "title".
type AcademicProgram struct {
	Title           string   `bson:"name" json:"name"`
	School          string   `bson:"school" json:"school"`
	DegreeOptions   []Degree `bson:"degree_options" json:"degree_options"`
	AreasOfInterest []string `bson:"areas_of_interest" json:"areas_of_interest"`
}

// Degree is a single degree option offered under an AcademicProgram.
type Degree struct {
	// Level is taken from the degree link's alt attribute (e.g. BS, MS, PHD).
	Level string `bson:"level" json:"level"`
	// PublicUrl is the href of the degree's public page.
	PublicUrl string `bson:"public_url" json:"public_url"`
	// CipCode is the Classification of Instructional Programs code shown on the page.
	CipCode string `bson:"cip_code" json:"cip_code"`
	// StemDesignated is true when the degree's footnote contains "STEM-Designated".
	StemDesignated bool `bson:"stem_designated" json:"stem_designated"`
	// JointProgram is true when the degree's footnote contains "Joint Program".
	JointProgram bool `bson:"joint_program" json:"joint_program"`
}

// ParseDegrees parses scraped degree HTML from <inDir>/degreesScraped.html
// (produced by scrapers.ScrapeDegrees) and writes the extracted programs as
// JSON to <outDir>/degrees.json. Any failure is fatal.
func ParseDegrees(inDir string, outDir string) {
	// Read the scraped HTML file.
	htmlPath := fmt.Sprintf("%s/degreesScraped.html", inDir)
	htmlBytes, err := os.ReadFile(htmlPath)
	if err != nil {
		log.Fatalf("could not read HTML file: %v", err)
	}
	utils.VPrintf("Read %d bytes from %s", len(htmlBytes), htmlPath)

	// Parse the document.
	page, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlBytes)))
	if err != nil {
		log.Fatalf("failed to parse HTML: %v", err)
	}

	// Find the main content area containing the program listings.
	content := page.Find("article .col-sm-12").First()
	if content.Length() == 0 {
		log.Fatalf("failed to find content area")
	}
	utils.VPrintf("Found main content area")

	// Generate all possible combinations of degree filters.
	// This is done to cover all degrees from different schools e.g. ECS, NSM, etc.
	allProgramHTMLs := generateAllCombinations()
	utils.VPrintf("Generated %d program combinations to search", len(allProgramHTMLs))

	var allPrograms []AcademicProgram
	for _, programHTML := range allProgramHTMLs {
		content.Find(programHTML).Each(func(i int, s *goquery.Selection) {
			extractProgram(s, &allPrograms)
		})
	}
	utils.VPrintf("Extracted %d programs", len(allPrograms))

	// Convert to JSON.
	marshalled, err := json.MarshalIndent(allPrograms, "", "\t")
	if err != nil {
		log.Fatalf("could not convert programs to JSON format: %v", err)
	}

	// Write the JSON in one shot; os.WriteFile replaces the manual
	// Create/Write/Close sequence and cannot leak the file handle.
	outPath := fmt.Sprintf("%s/degrees.json", outDir)
	if err := os.WriteFile(outPath, marshalled, 0644); err != nil {
		log.Fatalf("could not write to output file: %v", err)
	}
	utils.VPrintf("Successfully wrote degrees to %s", outPath)
}

// extractProgram pulls a single academic program (title, school, degree
// options, and areas of interest) out of the given program element and
// appends it to programs.
func extractProgram(selection *goquery.Selection, programs *[]AcademicProgram) {
	header := selection.Find("div > h3").Parent()
	title := strings.TrimSpace(header.Find("h3").Text())
	school := strings.TrimSpace(header.Find("div.school").Text())
	utils.VPrintf("Extracting program: %s (%s)", title, school)

	var degrees []Degree
	selection.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) {
		// The alt attribute represents the degree option (e.g. BS, MS, PHD).
		level, ok := degreeLink.Attr("alt")
		if !ok {
			log.Printf("degree link for %q is missing its alt attribute; skipping", title)
			return
		}

		// The href attribute is the URL to the degree's page.
		href, ok := degreeLink.Attr("href")
		if !ok {
			log.Printf("degree link for %q is missing its href attribute; skipping", title)
			return
		}

		// Classification of Instructional Programs (CIP) code: a standardized
		// system for reporting data about academic programs across different
		// colleges and universities.
		cipCode := strings.TrimSpace(degreeLink.Find("div.cip_code").Text())

		// Footnote from the degree HTML; the relevant footnotes are
		// 'STEM-Designated' and 'Joint Program'. Trim once and reuse.
		footnote := strings.TrimSpace(degreeLink.Find("div.footnote").Text())

		degrees = append(degrees, Degree{
			Level:          level,
			PublicUrl:      strings.TrimSpace(href),
			CipCode:        cipCode,
			StemDesignated: strings.Contains(footnote, "STEM-Designated"),
			JointProgram:   strings.Contains(footnote, "Joint Program"),
		})
	})
	utils.VPrintf(" Found %d degrees", len(degrees))

	// Tags that correlate to what might interest a student.
	// Example for Computer Science: Artificial intelligence, AI, computer
	// science, software, robotics, computer vision, digital forensics.
	areasText := strings.TrimSpace(selection.Find("div.areas_of_interest.d-none").First().Text())
	// Guard the empty case: strings.Split("") would yield [""], which would
	// serialize as [""] instead of [] in the JSON output.
	areasOfInterest := []string{}
	if areasText != "" {
		areasOfInterest = strings.Split(areasText, ", ")
	}

	newProgram := AcademicProgram{
		Title:           title,
		School:          school,
		DegreeOptions:   degrees,
		AreasOfInterest: areasOfInterest,
	}
	utils.VPrintf(" Areas of interest: %d topics", len(newProgram.AreasOfInterest))

	*programs = append(*programs, newProgram)
}

// generateAllCombinations builds the list of CSS selectors used to locate
// degree program entries in the scraped page, one selector per school.
// Each selector combines the document's shared filter classes with a
// school-specific class (e.g. ecs, nsm).
func generateAllCombinations() []string {
	// School slugs that appear as CSS classes on the degree listing page.
	schools := []string{"bass", "jindal", "nsm", "ecs", "bbs", "epps"}

	// One selector per school; the slice is sized up front.
	selectors := make([]string, len(schools))
	for i, school := range schools {
		selectors[i] = fmt.Sprintf("div .element-item.all.alldegrees.allschools.academic.%s", school)
	}
	return selectors
}
46 changes: 46 additions & 0 deletions scrapers/degrees.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package scrapers

import (
"fmt"
"log"
"os"
"path/filepath"

"github.com/UTDNebula/api-tools/utils"
"github.com/chromedp/chromedp"
)

// ScrapeDegrees loads the UTD degrees listing page in a headless browser and
// saves the fully rendered HTML to <outDir>/degreesScraped.html, where
// parser.ParseDegrees expects to find it. Any failure is a panic.
func ScrapeDegrees(outDir string) {
	// The URL fragment only selects an initial filter; the page contains all degrees.
	const URL = "https://academics.utdallas.edu/degrees/#filter=.alldegrees.bass"

	ctx, cancel := utils.InitChromeDp()
	defer cancel()

	var html string
	log.Println("Scraping Degrees!")
	err := chromedp.Run(ctx,
		chromedp.Navigate(URL),
		chromedp.WaitVisible("body", chromedp.ByQuery),
		chromedp.OuterHTML("html", &html, chromedp.ByQuery),
	)
	if err != nil {
		log.Panicf("failed to scrape: %v", err)
	}

	// The file is written directly into outDir (the parser reads
	// <inDir>/degreesScraped.html), so ensure outDir itself exists rather
	// than creating an unused "degrees" subdirectory.
	outPath := fmt.Sprintf("%s/degreesScraped.html", outDir)
	if err := os.MkdirAll(filepath.Dir(outPath), os.ModePerm); err != nil {
		log.Panicf("failed to create directory: %v", err)
	}

	// Write raw HTML to file.
	if err := os.WriteFile(outPath, []byte(html), 0644); err != nil {
		log.Panicf("failed to write scraped HTML: %v", err)
	}

	log.Printf("Finished scraping degrees page successfully!\n\n")
}