diff --git a/main.go b/main.go
index 3f405d9..c3aae01 100644
--- a/main.go
+++ b/main.go
@@ -48,6 +48,8 @@ func main() {
 	mapFlag := flag.Bool("map", false, "Alongside -scrape, -parse, or -upload, signifies that the UTD map should be scraped/parsed/uploaded.")
 	// Flag for academic calendar scraping
 	academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.")
+	// Flag for degree scraping and parsing
+	degrees := flag.Bool("degrees", false, "Alongside -scrape or -parse, signifies that the degrees should be scraped/parsed.")
 
 	// Flags for parsing
 	parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
@@ -118,6 +120,8 @@ func main() {
 			scrapers.ScrapeMapLocations(*outDir)
 		case *academicCalendars:
 			scrapers.ScrapeAcademicCalendars(*outDir)
+		case *degrees:
+			scrapers.ScrapeDegrees(*outDir)
 		default:
 			log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
 		}
@@ -135,6 +139,8 @@ func main() {
 			parser.ParseAcademicCalendars(*inDir, *outDir)
 		case *scrapeDiscounts:
 			parser.ParseDiscounts(*inDir, *outDir)
+		case *degrees:
+			parser.ParseDegrees(*inDir, *outDir)
 		default:
 			parser.Parse(*inDir, *outDir, *csvDir, *skipValidation)
 		}
diff --git a/parser/degreeParser.go b/parser/degreeParser.go
new file mode 100644
index 0000000..967b2fa
--- /dev/null
+++ b/parser/degreeParser.go
@@ -0,0 +1,161 @@
+package parser
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/UTDNebula/api-tools/utils"
+)
+
+type AcademicProgram struct {
+	Title           string   `bson:"name" json:"name"`
+	School          string   `bson:"school" json:"school"`
+	DegreeOptions   []Degree `bson:"degree_options" json:"degree_options"`
+	AreasOfInterest []string `bson:"areas_of_interest" json:"areas_of_interest"`
+}
+
+type Degree struct {
+	Level          string `bson:"level" json:"level"`
+	PublicUrl      string `bson:"public_url" json:"public_url"`
+	CipCode        string `bson:"cip_code" json:"cip_code"`
+	StemDesignated bool   `bson:"stem_designated" json:"stem_designated"`
+	JointProgram   bool   `bson:"joint_program" json:"joint_program"`
+}
+
+// Parses scraped degree HTML and outputs the data as JSON
+func ParseDegrees(inDir string, outDir string) {
+	// Read the scraped HTML file
+	htmlPath := fmt.Sprintf("%s/degreesScraped.html", inDir)
+	htmlBytes, err := os.ReadFile(htmlPath)
+	if err != nil {
+		log.Fatalf("could not read HTML file: %v", err)
+	}
+	utils.VPrintf("Read %d bytes from %s", len(htmlBytes), htmlPath)
+
+	// Parse the document
+	page, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBytes))
+	if err != nil {
+		log.Fatalf("failed to parse HTML: %v", err)
+	}
+
+	// Find main content
+	content := page.Find("article .col-sm-12").First()
+	if content.Length() == 0 {
+		log.Fatalf("failed to find content area")
+	}
+	utils.VPrintf("Found main content area")
+
+	// Generate all possible combinations of degree filters
+	// This is done to cover all degrees from different schools, e.g. ECS, NSM, etc.
+	allProgramSelectors := generateAllCombinations()
+	utils.VPrintf("Generated %d program selectors to search", len(allProgramSelectors))
+
+	var allPrograms []AcademicProgram
+	for _, programSelector := range allProgramSelectors {
+		content.Find(programSelector).Each(func(i int, s *goquery.Selection) {
+			extractProgram(s, &allPrograms)
+		})
+	}
+	utils.VPrintf("Extracted %d programs", len(allPrograms))
+
+	// Convert to JSON
+	marshalled, err := json.MarshalIndent(allPrograms, "", "\t")
+	if err != nil {
+		log.Fatalf("could not convert programs to JSON format: %v", err)
+	}
+
+	// Write to output file
+	outFile, err := os.Create(fmt.Sprintf("%s/degrees.json", outDir))
+	if err != nil {
+		log.Fatalf("could not create output file: %v", err)
+	}
+	defer outFile.Close()
+
+	_, err = outFile.Write(marshalled)
+	if err != nil {
+		log.Fatalf("could not write to output file: %v", err)
+	}
+	utils.VPrintf("Successfully wrote degrees to %s/degrees.json", outDir)
+}
+
+func extractProgram(selection *goquery.Selection, programs *[]AcademicProgram) {
+	header := selection.Find("div > h3").Parent()
+	title := header.Find("h3")
+	school := header.Find("div.school")
+	utils.VPrintf("Extracting program: %s (%s)", strings.TrimSpace(title.Text()), strings.TrimSpace(school.Text()))
+
+	var degrees []Degree
+	selection.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) {
+		// The alt attribute represents the degree option
+		// Examples: BS, MS, PhD
+		degreeOption, exists := degreeLink.Attr("alt")
+		if !exists {
+			log.Println("could not find alt attribute, skipping degree")
+			return
+		}
+
+		// Extracts the URL to the degree's page
+		urlForDegree, exists := degreeLink.Attr("href")
+		if !exists {
+			log.Println("could not find href attribute, skipping degree")
+			return
+		}
+
+		// Extracts the Classification of Instructional Programs (CIP) code
+		// These codes provide a standardized system for reporting data about academic programs across different colleges and universities
+		cipCode := degreeLink.Find("div.cip_code")
+
+		// Extracts the footnote from the degree HTML
+		// Relevant footnotes are 'STEM-Designated' and 'Joint Program'
+		footnote := strings.TrimSpace(degreeLink.Find("div.footnote").Text())
+
+		degrees = append(degrees, Degree{
+			Level:          degreeOption,
+			PublicUrl:      strings.TrimSpace(urlForDegree),
+			CipCode:        strings.TrimSpace(cipCode.Text()),
+			StemDesignated: strings.Contains(footnote, "STEM-Designated"),
+			JointProgram:   strings.Contains(footnote, "Joint Program"),
+		})
+	})
+	utils.VPrintf("  Found %d degrees", len(degrees))
+
+	// Extracts a list of tags that correlate to what might interest a student
+	// Example for Computer Science: artificial intelligence, AI, computer science, software, robotics, computer vision, digital forensics
+	areasOfInterest := selection.Find("div.areas_of_interest.d-none").First()
+	// Guard: strings.Split on an empty string would yield [""], not an empty slice
+	var interests []string
+	if text := strings.TrimSpace(areasOfInterest.Text()); text != "" {
+		interests = strings.Split(text, ", ")
+	}
+
+	newProgram := AcademicProgram{
+		Title:           strings.TrimSpace(title.Text()),
+		School:          strings.TrimSpace(school.Text()),
+		DegreeOptions:   degrees,
+		AreasOfInterest: interests,
+	}
+	utils.VPrintf("  Areas of interest: %d topics", len(newProgram.AreasOfInterest))
+
+	*programs = append(*programs, newProgram)
+}
+
+// Generates a list of CSS selectors, one per school, each matching that
+// school's degree program elements in the scraped document
+func generateAllCombinations() []string {
+	// List of schools for which we need to generate combination selectors
[]string{"bass", "jindal", "nsm", "ecs", "bbs", "epps"} + + var combinations []string + + // Loop through each school and generate the corresponding HTML selector + for _, s := range schools { + combinations = append(combinations, fmt.Sprintf("div .element-item.all.alldegrees.allschools.academic.%s", s)) + } + + return combinations +} diff --git a/scrapers/degrees.go b/scrapers/degrees.go new file mode 100644 index 0000000..5a9ac48 --- /dev/null +++ b/scrapers/degrees.go @@ -0,0 +1,46 @@ +package scrapers + +import ( + "fmt" + "log" + "os" + "path/filepath" + + "github.com/UTDNebula/api-tools/utils" + "github.com/chromedp/chromedp" +) + +func ScrapeDegrees(outDir string) { + // Define the URL + const URL = "https://academics.utdallas.edu/degrees/#filter=.alldegrees.bass" + + ctx, cancel := utils.InitChromeDp() + defer cancel() + + var html string + log.Println("Scraping Degrees!") + err := chromedp.Run(ctx, + chromedp.Navigate(URL), + chromedp.WaitVisible("body", chromedp.ByQuery), + chromedp.OuterHTML("html", &html, chromedp.ByQuery), + ) + if err != nil { + log.Panicf("failed to scrape: %v", err) + } + + // Ensure the output directory exists + outputPath := filepath.Join(outDir, "degrees") + err = os.MkdirAll(outputPath, os.ModePerm) + if err != nil { + log.Panicf("failed to create directory: %v", err) + } + + // Write raw HTML to file + outPath := fmt.Sprintf("%s/degreesScraped.html", outDir) + err = os.WriteFile(outPath, []byte(html), 0644) + if err != nil { + panic(err) + } + + log.Printf("Finished scraping discount page successfully!\n\n") +}