sw-jobs-go/main.go

package main

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/aws/aws-lambda-go/lambda"
	"github.com/gocolly/colly"
)

type job struct {
	Title       string `json:"title"`
	Logo        string `json:"logo"`
	Company     string `json:"company"`
	Location    string `json:"location"`
	Type        string `json:"type"`
	Description string `json:"description"`
	Link        string `json:"link"`
	Skills      skills `json:"skills"`
}

type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
}

var (
	jobs      []job
	lastFetch time.Time
	cacheTTL  = time.Minute * 5
	jobLimit  = 20
)

func skillChecker(description string) skills {
	return skills{
		React:      strings.Contains(description, "React"),
		Python:     strings.Contains(description, "Python"),
		Golang:     strings.Contains(description, "Go"),
		Svelte:     strings.Contains(description, "Svelte"),
		Nextjs:     strings.Contains(description, "Next.js"),
		Typescript: strings.Contains(description, "TypeScript"),
	}
}

func fetchData() error {

	baseUrl := "https://thehub.io"
	// Instantiate default collector
	c := colly.NewCollector(
		// visit only the hub
		colly.AllowedDomains("www.thehub.io", "thehub.io"),

		// Cache responses to prevent multiple requests
		colly.CacheDir("./tmp"),
	)

	// Slice of excluded words in the job titles
	excluded := []string{"senior", "lead"}
	// Instantiate a new collector to visit the job details page
	detailsCollector := c.Clone()

	// Limit the number of jobs to fetch
	jobCount := 0

	// On every <div> element with class "card__content attribute call callback
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		// Return if the job limit has been reached
		if jobCount >= jobLimit {
			return
		}
		// Get the title and ensure it doesn't contain any excluded words
		title := e.ChildText("span.card-job-find-list__position")
		for _, excludedWord := range excluded {
			if strings.Contains(strings.ToLower(title), excludedWord) {
				return
			}
		}
		link := e.ChildAttr("a", "href")
		fullLink := baseUrl + link

		detailsCollector.Visit(fullLink)
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {
		if jobCount >= jobLimit {
			return
		}
		// Get logo and trim the url

		logo := e.ChildAttr("div.media-item__image", "style")
		cutLeft := "background-image:url("
		cutRight := ");"
		trimmedLogo := strings.Trim(logo, cutLeft+cutRight)

		// Get company name
		jobDetails := job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        trimmedLogo,
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: e.ChildText("content.text-block__content > span"),
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(e.ChildText("content.text-block__content > span")),
		}
		jobs = append(jobs, jobDetails)
		jobCount++
	})
	// Handle pagination
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		nextPage := e.Attr("href")
		if nextPage != "" {
			fullNextPage := baseUrl + nextPage
			fmt.Println("Visiting next page:", fullNextPage)
			e.Request.Visit(fullNextPage)
		}
	})
	// Visit the initial URL to start scraping
	err := c.Visit("https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&search=developer&paid=true&countryCode=DK&sorting=newJobs")
	if err != nil {
		return err
	}
	return nil
}

func handler(ctx context.Context) ([]job, error) {
	// Check if cache is valid
	if time.Since(lastFetch) < cacheTTL && len(jobs) > 0 {
		return jobs, nil
	}

	// Fetch new data
	err := fetchData()
	if err != nil {
		return nil, err
	}

	// Update cache timestamp
	lastFetch = time.Now()

	return jobs, nil
}

func main() {
	lambda.Start(handler)
}
initial commit 2024-06-08 14:11:40 +00:00			`package main`

			`import (`
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`"context"`
initial commit 2024-06-08 14:11:40 +00:00			`"fmt"`
			`"strings"`
added in memory caching for 5 minutes 2024-06-08 14:56:35 +00:00			`"time"`
initial commit 2024-06-08 14:11:40 +00:00
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`"github.com/aws/aws-lambda-go/lambda"`
initial commit 2024-06-08 14:11:40 +00:00			`"github.com/gocolly/colly"`
			`)`

			`type job struct {`
			Title string `json:"title"`
			Logo string `json:"logo"`
			Company string `json:"company"`
			Location string `json:"location"`
			Type string `json:"type"`
			Description string `json:"description"`
			Link string `json:"link"`
			Skills skills `json:"skills"`
			`}`

			`type skills struct {`
			React bool `json:"react"`
			Python bool `json:"python"`
			Golang bool `json:"golang"`
			Svelte bool `json:"svelte"`
			Nextjs bool `json:"nextjs"`
			Typescript bool `json:"typescript"`
			`}`

added in memory caching for 5 minutes 2024-06-08 14:56:35 +00:00			`var (`
			`jobs []job`
			`lastFetch time.Time`
			`cacheTTL = time.Minute * 5`
set job scraping limit to 20 jobs 2024-06-08 15:04:46 +00:00			`jobLimit = 20`
added in memory caching for 5 minutes 2024-06-08 14:56:35 +00:00			`)`

initial commit 2024-06-08 14:11:40 +00:00			`func skillChecker(description string) skills {`
			`return skills{`
			`React: strings.Contains(description, "React"),`
			`Python: strings.Contains(description, "Python"),`
			`Golang: strings.Contains(description, "Go"),`
			`Svelte: strings.Contains(description, "Svelte"),`
			`Nextjs: strings.Contains(description, "Next.js"),`
			`Typescript: strings.Contains(description, "TypeScript"),`
			`}`
			`}`

added in memory caching for 5 minutes 2024-06-08 14:56:35 +00:00			`func fetchData() error {`
initial commit 2024-06-08 14:11:40 +00:00
			`baseUrl := "https://thehub.io"`
			`// Instantiate default collector`
			`c := colly.NewCollector(`
			`// visit only the hub`
			`colly.AllowedDomains("www.thehub.io", "thehub.io"),`

			`// Cache responses to prevent multiple requests`
i alrady hate lambda 2024-06-08 17:35:01 +00:00			`colly.CacheDir("./tmp"),`
initial commit 2024-06-08 14:11:40 +00:00			`)`

			`// Slice of excluded words in the job titles`
			`excluded := []string{"senior", "lead"}`
			`// Instantiate a new collector to visit the job details page`
			`detailsCollector := c.Clone()`
set job scraping limit to 20 jobs 2024-06-08 15:04:46 +00:00
			`// Limit the number of jobs to fetch`
			`jobCount := 0`

initial commit 2024-06-08 14:11:40 +00:00			`// On every <div> element with class "card__content attribute call callback`
			`c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {`
set job scraping limit to 20 jobs 2024-06-08 15:04:46 +00:00			`// Return if the job limit has been reached`
			`if jobCount >= jobLimit {`
			`return`
			`}`
initial commit 2024-06-08 14:11:40 +00:00			`// Get the title and ensure it doesn't contain any excluded words`
			`title := e.ChildText("span.card-job-find-list__position")`
			`for _, excludedWord := range excluded {`
			`if strings.Contains(strings.ToLower(title), excludedWord) {`
			`return`
			`}`
			`}`
			`link := e.ChildAttr("a", "href")`
			`fullLink := baseUrl + link`

			`detailsCollector.Visit(fullLink)`
			`})`

			`detailsCollector.OnRequest(func(r *colly.Request) {`
			`fmt.Println("Visiting", r.URL.String())`
			`})`

			`detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {`
set job scraping limit to 20 jobs 2024-06-08 15:04:46 +00:00			`if jobCount >= jobLimit {`
			`return`
			`}`
initial commit 2024-06-08 14:11:40 +00:00			`// Get logo and trim the url`

			`logo := e.ChildAttr("div.media-item__image", "style")`
			`cutLeft := "background-image:url("`
			`cutRight := ");"`
			`trimmedLogo := strings.Trim(logo, cutLeft+cutRight)`

			`// Get company name`
			`jobDetails := job{`
			`Title: e.ChildText("h2[class=view-job-details__title]"),`
			`Logo: trimmedLogo,`
			`Company: e.ChildText(".bullet-inline-list > a:first-child"),`
			`Location: e.ChildText(".bullet-inline-list > a:nth-child(2)"),`
			`Type: e.ChildText(".bullet-inline-list > a:nth-child(3)"),`
			`Description: e.ChildText("content.text-block__content > span"),`
			`Link: e.Request.URL.String(),`
			`Skills: skillChecker(e.ChildText("content.text-block__content > span")),`
			`}`
			`jobs = append(jobs, jobDetails)`
set job scraping limit to 20 jobs 2024-06-08 15:04:46 +00:00			`jobCount++`
initial commit 2024-06-08 14:11:40 +00:00			`})`
			`// Handle pagination`
			`c.OnHTML("a.page-link", func(e *colly.HTMLElement) {`
			`nextPage := e.Attr("href")`
			`if nextPage != "" {`
			`fullNextPage := baseUrl + nextPage`
			`fmt.Println("Visiting next page:", fullNextPage)`
			`e.Request.Visit(fullNextPage)`
			`}`
			`})`
added in memory caching for 5 minutes 2024-06-08 14:56:35 +00:00			`// Visit the initial URL to start scraping`
			`err := c.Visit("https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&search=developer&paid=true&countryCode=DK&sorting=newJobs")`
			`if err != nil {`
			`return err`
			`}`
			`return nil`
			`}`

			`func handler(ctx context.Context) ([]job, error) {`
			`// Check if cache is valid`
			`if time.Since(lastFetch) < cacheTTL && len(jobs) > 0 {`
			`return jobs, nil`
			`}`

			`// Fetch new data`
			`err := fetchData()`
			`if err != nil {`
			`return nil, err`
			`}`

			`// Update cache timestamp`
			`lastFetch = time.Now()`
initial commit 2024-06-08 14:11:40 +00:00
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`return jobs, nil`
			`}`
initial commit 2024-06-08 14:11:40 +00:00
added aws lambda integrations 2024-06-08 14:48:00 +00:00			`func main() {`
			`lambda.Start(handler)`
initial commit 2024-06-08 14:11:40 +00:00			`}`