// Package main scrapes developer job postings from thehub.io and
// writes up to maxJobs of them to a JSON file, skipping senior roles.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/gocolly/colly"
)

// job holds the details scraped from a single job posting.
type job struct {
	Title       string `json:"title"`
	Logo        string `json:"logo"`
	Company     string `json:"company"`
	Location    string `json:"location"`
	Type        string `json:"type"`
	Description string `json:"description"`
	Link        string `json:"link"`
	Skills      skills `json:"skills"`
}

// skills flags which known technologies a job description mentions.
type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
	Tailwind   bool `json:"tailwind"`
}

// skillChecker reports which technologies appear verbatim (case-sensitive)
// in the given job description text.
func skillChecker(description string) skills {
	return skills{
		React:      strings.Contains(description, "React"),
		Python:     strings.Contains(description, "Python"),
		Golang:     strings.Contains(description, "Golang"),
		Svelte:     strings.Contains(description, "Svelte"),
		Nextjs:     strings.Contains(description, "Next.js"),
		Typescript: strings.Contains(description, "TypeScript"),
		Tailwind:   strings.Contains(description, "Tailwind"),
	}
}

// jobsToJson pretty-prints the collected jobs slice as JSON into file.
// It terminates the program on encoding failure.
func jobsToJson(file *os.File) {
	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ") // Pretty-print with indentation
	if err := encoder.Encode(jobs); err != nil {
		log.Fatalf("Cannot write to file %q: %s", fName, err)
	}
	fmt.Println("Job details successfully written to", fName)
}

// Package-level scraping state and configuration.
var (
	jobs     []job // slice to store job details
	jobCount int   // number of jobs scraped so far
	maxJobs  int    = 30
	fName    string = "jobs.json"
	// Job titles containing any of these words (case-insensitive) are skipped.
	excluded = []string{"senior", "lead", "founder", "cto", "vp of"}
)

// scrapeJobs crawls thehub.io job listings, follows each non-excluded
// posting to its detail page, collects up to maxJobs entries, and writes
// the result to fName as JSON.
func scrapeJobs() {
	file, err := os.Create(fName)
	if err != nil {
		log.Fatalf("Cannot create file %q: %s", fName, err)
	}
	defer file.Close()

	baseUrl := "https://thehub.io"
	searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"

	// Instantiate default collector
	c := colly.NewCollector(
		// visit only the hub
		colly.AllowedDomains("www.thehub.io", "thehub.io"),
		// Cache responses to prevent multiple requests
		colly.CacheDir("./thehub_cache"),
	)

	// Instantiate a new collector to visit the job details page
	detailsCollector := c.Clone()

	// On every element with class "card__content" call callback
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		// Ensure we only scrape the amount of jobs specified.
		if jobCount >= maxJobs {
			return
		}
		// Get the title and ensure it doesn't contain any excluded words
		title := e.ChildText("span.card-job-find-list__position")
		for _, excludedWord := range excluded {
			if strings.Contains(strings.ToLower(title), excludedWord) {
				return
			}
		}
		link := e.ChildAttr("a", "href")
		fullLink := baseUrl + link
		if err := detailsCollector.Visit(fullLink); err != nil {
			log.Printf("Cannot visit %q: %s", fullLink, err)
		}
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) {
		if jobCount >= maxJobs {
			return
		}
		// Get logo and trim the surrounding CSS url() wrapper.
		// NOTE: strings.Trim treats its second argument as a character
		// set, which could eat into the URL itself; use exact
		// prefix/suffix trimming instead.
		logo := e.ChildAttr("div.media-item__image", "style")
		cutLeft := "background-image:url("
		cutRight := ");"
		trimmedLogo := strings.TrimSuffix(strings.TrimPrefix(logo, cutLeft), cutRight)

		// Get the HTML of the job description
		descriptionHTML, err := e.DOM.Find("content.text-block__content > span").Html()
		if err != nil {
			log.Printf("Error getting HTML of description: %s", err)
			return
		}

		jobDetails := job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        trimmedLogo,
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: descriptionHTML,
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(e.ChildText("content.text-block__content > span")),
		}
		jobs = append(jobs, jobDetails)
		jobCount++
		fmt.Println("Scraped job", jobCount)
	})

	// Handle pagination
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		nextPage := e.Attr("href")
		if nextPage != "" {
			fullNextPage := baseUrl + nextPage
			fmt.Println("Visiting next page:", fullNextPage)
			e.Request.Visit(fullNextPage)
		}
	})

	if err := c.Visit(searchString); err != nil {
		log.Fatalf("Cannot visit %q: %s", searchString, err)
	}

	// Colly runs synchronously, so all pages have been processed by now.
	// Write whatever was collected — even if fewer than maxJobs were
	// found (the original only wrote the file at exactly maxJobs,
	// silently losing everything otherwise).
	jobsToJson(file)
}

func main() {
	scrapeJobs()
}