sw-jobs-go/scraper.go

package main

import (
	"context"
	"fmt"
	"strings"

	"github.com/aws/aws-lambda-go/lambda"
	"github.com/gocolly/colly"
)

type job struct {
	Title       string `json:"title"`
	Logo        string `json:"logo"`
	Company     string `json:"company"`
	Location    string `json:"location"`
	Type        string `json:"type"`
	Description string `json:"description"`
	Link        string `json:"link"`
	Skills      skills `json:"skills"`
}

type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
}

func skillChecker(description string) skills {
	return skills{
		React:      strings.Contains(description, "React"),
		Python:     strings.Contains(description, "Python"),
		Golang:     strings.Contains(description, "Go"),
		Svelte:     strings.Contains(description, "Svelte"),
		Nextjs:     strings.Contains(description, "Next.js"),
		Typescript: strings.Contains(description, "TypeScript"),
	}
}

// Slice to store job details
var jobs []job

func handler(ctx context.Context) ([]job, error) {

	baseUrl := "https://thehub.io"
	searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"
	// Instantiate default collector
	c := colly.NewCollector(
		// visit only the hub
		colly.AllowedDomains("www.thehub.io", "thehub.io"),

		// Cache responses to prevent multiple requests
		colly.CacheDir("./thehub_cache"),
	)

	// Slice of excluded words in the job titles
	excluded := []string{"senior", "lead"}
	// Instantiate a new collector to visit the job details page
	detailsCollector := c.Clone()
	// On every <div> element with class "card__content attribute call callback
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		// Get the title and ensure it doesn't contain any excluded words
		title := e.ChildText("span.card-job-find-list__position")
		for _, excludedWord := range excluded {
			if strings.Contains(strings.ToLower(title), excludedWord) {
				return
			}
		}
		link := e.ChildAttr("a", "href")
		fullLink := baseUrl + link

		detailsCollector.Visit(fullLink)
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {
		// Get logo and trim the url

		logo := e.ChildAttr("div.media-item__image", "style")
		cutLeft := "background-image:url("
		cutRight := ");"
		trimmedLogo := strings.Trim(logo, cutLeft+cutRight)

		// Get company name
		jobDetails := job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        trimmedLogo,
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: e.ChildText("content.text-block__content > span"),
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(e.ChildText("content.text-block__content > span")),
		}
		jobs = append(jobs, jobDetails)
	})
	// Handle pagination
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		nextPage := e.Attr("href")
		if nextPage != "" {
			fullNextPage := baseUrl + nextPage
			fmt.Println("Visiting next page:", fullNextPage)
			e.Request.Visit(fullNextPage)
		}
	})

	c.Visit(searchString)
	return jobs, nil
}

func main() {
	lambda.Start(handler)
}
initial commit 2024-06-08 14:11:40 +00:00			`package main`

			`import (`
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`"context"`
initial commit 2024-06-08 14:11:40 +00:00			`"fmt"`
			`"strings"`

added aws lambda integrations 2024-06-08 14:48:00 +00:00			`"github.com/aws/aws-lambda-go/lambda"`
initial commit 2024-06-08 14:11:40 +00:00			`"github.com/gocolly/colly"`
			`)`

			`type job struct {`
			Title string `json:"title"`
			Logo string `json:"logo"`
			Company string `json:"company"`
			Location string `json:"location"`
			Type string `json:"type"`
			Description string `json:"description"`
			Link string `json:"link"`
			Skills skills `json:"skills"`
			`}`

			`type skills struct {`
			React bool `json:"react"`
			Python bool `json:"python"`
			Golang bool `json:"golang"`
			Svelte bool `json:"svelte"`
			Nextjs bool `json:"nextjs"`
			Typescript bool `json:"typescript"`
			`}`

			`func skillChecker(description string) skills {`
			`return skills{`
			`React: strings.Contains(description, "React"),`
			`Python: strings.Contains(description, "Python"),`
			`Golang: strings.Contains(description, "Go"),`
			`Svelte: strings.Contains(description, "Svelte"),`
			`Nextjs: strings.Contains(description, "Next.js"),`
			`Typescript: strings.Contains(description, "TypeScript"),`
			`}`
			`}`

			`// Slice to store job details`
			`var jobs []job`

added aws lambda integrations 2024-06-08 14:48:00 +00:00			`func handler(ctx context.Context) ([]job, error) {`
initial commit 2024-06-08 14:11:40 +00:00
			`baseUrl := "https://thehub.io"`
			`searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"`
			`// Instantiate default collector`
			`c := colly.NewCollector(`
			`// visit only the hub`
			`colly.AllowedDomains("www.thehub.io", "thehub.io"),`

			`// Cache responses to prevent multiple requests`
			`colly.CacheDir("./thehub_cache"),`
			`)`

			`// Slice of excluded words in the job titles`
			`excluded := []string{"senior", "lead"}`
			`// Instantiate a new collector to visit the job details page`
			`detailsCollector := c.Clone()`
			`// On every <div> element with class "card__content attribute call callback`
			`c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {`
			`// Get the title and ensure it doesn't contain any excluded words`
			`title := e.ChildText("span.card-job-find-list__position")`
			`for _, excludedWord := range excluded {`
			`if strings.Contains(strings.ToLower(title), excludedWord) {`
			`return`
			`}`
			`}`
			`link := e.ChildAttr("a", "href")`
			`fullLink := baseUrl + link`

			`detailsCollector.Visit(fullLink)`
			`})`

			`detailsCollector.OnRequest(func(r *colly.Request) {`
			`fmt.Println("Visiting", r.URL.String())`
			`})`

			`detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {`
			`// Get logo and trim the url`

			`logo := e.ChildAttr("div.media-item__image", "style")`
			`cutLeft := "background-image:url("`
			`cutRight := ");"`
			`trimmedLogo := strings.Trim(logo, cutLeft+cutRight)`

			`// Get company name`
			`jobDetails := job{`
			`Title: e.ChildText("h2[class=view-job-details__title]"),`
			`Logo: trimmedLogo,`
			`Company: e.ChildText(".bullet-inline-list > a:first-child"),`
			`Location: e.ChildText(".bullet-inline-list > a:nth-child(2)"),`
			`Type: e.ChildText(".bullet-inline-list > a:nth-child(3)"),`
			`Description: e.ChildText("content.text-block__content > span"),`
			`Link: e.Request.URL.String(),`
			`Skills: skillChecker(e.ChildText("content.text-block__content > span")),`
			`}`
			`jobs = append(jobs, jobDetails)`
			`})`
			`// Handle pagination`
			`c.OnHTML("a.page-link", func(e *colly.HTMLElement) {`
			`nextPage := e.Attr("href")`
			`if nextPage != "" {`
			`fullNextPage := baseUrl + nextPage`
			`fmt.Println("Visiting next page:", fullNextPage)`
			`e.Request.Visit(fullNextPage)`
			`}`
			`})`

			`c.Visit(searchString)`
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`return jobs, nil`
			`}`
initial commit 2024-06-08 14:11:40 +00:00
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`func main() {`
			`lambda.Start(handler)`
initial commit 2024-06-08 14:11:40 +00:00			`}`