sw-jobs-go-lambda/main.go

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/gocolly/colly"
)

// job is one scraped job posting. Values are filled in from the posting's
// detail page and serialised to the JSON output using the field tags below.
type job struct {
	// Title is the text of the posting's heading.
	Title       string `json:"title"`
	// Logo is the logo image URL taken from the logo element's inline style.
	Logo        string `json:"logo"`
	// Company is the text of the first link in the posting's bullet list.
	Company     string `json:"company"`
	// Location is the text of the second link in the posting's bullet list.
	Location    string `json:"location"`
	// Type is the text of the third link in the posting's bullet list
	// (presumably the employment type; confirm against the site markup).
	Type        string `json:"type"`
	// Description is the text of the posting body.
	Description string `json:"description"`
	// Link is the URL of the detail page the posting was scraped from.
	Link        string `json:"link"`
	// Skills flags which tracked technologies Description mentions.
	Skills      skills `json:"skills"`
}

// skills records which tracked technologies a job description mentions.
type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
	Tailwind   bool `json:"tailwind"`
}

// skillChecker reports which tracked technologies are mentioned in
// description.
//
// Matching is a case-insensitive substring search, so "TypeScript",
// "Typescript" and "typescript" are all recognised, and Next.js is also
// matched when written as "NextJS". Because this is a plain substring test,
// a longer word that embeds a keyword (for example "reactive") counts as a
// mention too. An empty description yields the zero skills value.
func skillChecker(description string) skills {
	// Lower-case once so every keyword check is case-insensitive.
	text := strings.ToLower(description)

	// mentions reports whether any of the lower-case keywords occurs in text.
	mentions := func(keywords ...string) bool {
		for _, keyword := range keywords {
			if strings.Contains(text, keyword) {
				return true
			}
		}
		return false
	}

	return skills{
		React:      mentions("react"),
		Python:     mentions("python"),
		Golang:     mentions("golang"),
		Svelte:     mentions("svelte"),
		Nextjs:     mentions("next.js", "nextjs"),
		Typescript: mentions("typescript"),
		Tailwind:   mentions("tailwind"),
	}
}
// jobsToJson serialises the package-level jobs slice into file as indented
// JSON. A failed encode is fatal: the error is logged together with fName and
// the process exits. On success a confirmation naming fName is printed to
// standard output.
func jobsToJson(file *os.File) {
	enc := json.NewEncoder(file)
	enc.SetIndent("", "  ") // no line prefix, two-space indentation

	err := enc.Encode(jobs)
	if err != nil {
		log.Fatalf("Cannot write to file %q: %s", fName, err)
	}

	fmt.Println("Job details successfully written to", fName)
}

// Package-level scraper state and settings.
var (
	// jobs accumulates every job posting scraped so far.
	jobs []job
	// jobCount is the number of postings appended to jobs.
	jobCount int
	// maxJobs is the number of postings after which scraping stops.
	maxJobs = 30
	// fName is the path of the JSON output file.
	fName = "jobs.json"
)

// main scrapes developer job postings from thehub.io and writes at most
// maxJobs of them to fName as JSON.
//
// Two collectors cooperate: c walks the paginated search results and filters
// postings by title, while detailsCollector parses each posting's own page.
// The output file is written once, after crawling has finished, so results
// are saved even when fewer than maxJobs postings are found.
func main() {
	file, err := os.Create(fName)
	if err != nil {
		log.Fatalf("Cannot create file %q: %s", fName, err)
	}

	searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"

	// Listing collector: restricted to thehub.io, with responses cached on
	// disk to prevent multiple requests for the same page.
	c := colly.NewCollector(
		colly.AllowedDomains("www.thehub.io", "thehub.io"),
		colly.CacheDir("./thehub_cache"),
	)

	// Lower-case words that disqualify a posting when found in its title.
	excluded := []string{"senior", "lead"}

	// Separate collector, cloned from c, for the job detail pages.
	detailsCollector := c.Clone()

	// For every search-result card: filter by title, then visit the posting.
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		// Stop requesting detail pages once enough jobs have been collected.
		if jobCount >= maxJobs {
			return
		}

		title := strings.ToLower(e.ChildText("span.card-job-find-list__position"))
		for _, excludedWord := range excluded {
			if strings.Contains(title, excludedWord) {
				return
			}
		}

		href := e.ChildAttr("a", "href")
		if href == "" {
			return
		}
		// Resolve against the current page so that both relative and
		// absolute hrefs produce a valid URL.
		fullLink := e.Request.AbsoluteURL(href)
		if fullLink == "" {
			return
		}
		// A posting listed on several pages is only scraped once, so an
		// already-visited error is expected and not worth reporting.
		if err := detailsCollector.Visit(fullLink); err != nil && err != colly.ErrAlreadyVisited {
			log.Printf("Cannot visit job page %q: %s", fullLink, err)
		}
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Parse one job detail page into a job and store it.
	detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) {
		if jobCount >= maxJobs {
			return
		}

		// Read the description once; it feeds both the field and the skill scan.
		description := e.ChildText("content.text-block__content > span")

		jobs = append(jobs, job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        logoURLFromStyle(e.ChildAttr("div.media-item__image", "style")),
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: description,
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(description),
		})
		jobCount++
		fmt.Println("Scraped job", jobCount)
	})

	// Handle pagination: follow page links until enough jobs are collected.
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		if jobCount >= maxJobs {
			return
		}

		href := e.Attr("href")
		if href == "" {
			return
		}
		fullNextPage := e.Request.AbsoluteURL(href)
		if fullNextPage == "" {
			return
		}
		fmt.Println("Visiting next page:", fullNextPage)
		// Every result page links to the others, so most of these visits are
		// rejected as already visited; only report other failures.
		if err := e.Request.Visit(fullNextPage); err != nil && err != colly.ErrAlreadyVisited {
			log.Printf("Cannot visit page %q: %s", fullNextPage, err)
		}
	})

	visitErr := c.Visit(searchString)

	// Encode an empty result as [] rather than null.
	if jobs == nil {
		jobs = []job{}
	}
	jobsToJson(file)
	if err := file.Close(); err != nil {
		log.Fatalf("Cannot close file %q: %s", fName, err)
	}

	// Report a failed initial request only after the (empty) result has been
	// written, so the output file always holds valid JSON.
	if visitErr != nil {
		log.Fatalf("Cannot visit %q: %s", searchString, visitErr)
	}
}

// logoURLFromStyle extracts the URL from an inline CSS declaration such as
// "background-image:url(https://example.com/logo.png);". Surrounding
// whitespace and quotes are removed. It returns "" when style contains no
// url(...) value.
func logoURLFromStyle(style string) string {
	const opener = "url("

	start := strings.Index(style, opener)
	if start < 0 {
		return ""
	}
	rest := style[start+len(opener):]
	// Cut at the closing parenthesis; tolerate it being missing.
	if end := strings.Index(rest, ")"); end >= 0 {
		rest = rest[:end]
	}
	return strings.Trim(strings.TrimSpace(rest), `"'`)
}
initial commit 2024-06-08 14:11:40 +00:00			`package main`

			`import (`
			`"encoding/json"`
			`"fmt"`
			`"log"`
			`"os"`
			`"strings"`

			`"github.com/gocolly/colly"`
			`)`

			`type job struct {`
			Title string `json:"title"`
			Logo string `json:"logo"`
			Company string `json:"company"`
			Location string `json:"location"`
			Type string `json:"type"`
			Description string `json:"description"`
			Link string `json:"link"`
			Skills skills `json:"skills"`
			`}`

			`type skills struct {`
			React bool `json:"react"`
			Python bool `json:"python"`
			Golang bool `json:"golang"`
			Svelte bool `json:"svelte"`
			Nextjs bool `json:"nextjs"`
			Typescript bool `json:"typescript"`
added tailwind 2024-06-08 19:12:04 +00:00			Tailwind bool `json:"tailwind"`
initial commit 2024-06-08 14:11:40 +00:00			`}`

			`func skillChecker(description string) skills {`
			`return skills{`
			`React: strings.Contains(description, "React"),`
			`Python: strings.Contains(description, "Python"),`
added tailwind 2024-06-08 19:12:04 +00:00			`Golang: strings.Contains(description, "Golang"),`
initial commit 2024-06-08 14:11:40 +00:00			`Svelte: strings.Contains(description, "Svelte"),`
			`Nextjs: strings.Contains(description, "Next.js"),`
			`Typescript: strings.Contains(description, "TypeScript"),`
added tailwind 2024-06-08 19:12:04 +00:00			`Tailwind: strings.Contains(description, "Tailwind"),`
initial commit 2024-06-08 14:11:40 +00:00			`}`
			`}`
fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00			`func jobsToJson(file *os.File) {`
			`// Encode jobs slice to JSON`
			`encoder := json.NewEncoder(file)`
			`encoder.SetIndent("", " ") // Pretty-print with indentation`
			`if err := encoder.Encode(jobs); err != nil {`
			`log.Fatalf("Cannot write to file %q: %s", fName, err)`
			`}`

			`fmt.Println("Job details successfully written to", fName)`
			`}`
initial commit 2024-06-08 14:11:40 +00:00
			`// Slice to store job details`
fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00			`var (`
			`jobs []job`
			`jobCount int`
			`maxJobs int = 30`
			`fName string = "jobs.json"`
			`)`
initial commit 2024-06-08 14:11:40 +00:00
			`func main() {`

			`file, err := os.Create(fName)`
			`if err != nil {`
			`log.Fatalf("Cannot create file %q: %s", fName, err)`
			`}`
			`defer file.Close()`
			`baseUrl := "https://thehub.io"`
			`searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"`
			`// Instantiate default collector`
			`c := colly.NewCollector(`
			`// visit only the hub`
			`colly.AllowedDomains("www.thehub.io", "thehub.io"),`

			`// Cache responses to prevent multiple requests`
			`colly.CacheDir("./thehub_cache"),`
			`)`

			`// Slice of excluded words in the job titles`
			`excluded := []string{"senior", "lead"}`
			`// Instantiate a new collector to visit the job details page`
			`detailsCollector := c.Clone()`
			`// On every <div> element with class "card__content attribute call callback`
			`c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {`
fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00			`//ensure only scrape the amount of jobs specified`

initial commit 2024-06-08 14:11:40 +00:00			`// Get the title and ensure it doesn't contain any excluded words`
			`title := e.ChildText("span.card-job-find-list__position")`
			`for _, excludedWord := range excluded {`
			`if strings.Contains(strings.ToLower(title), excludedWord) {`
			`return`
			`}`
			`}`
			`link := e.ChildAttr("a", "href")`
			`fullLink := baseUrl + link`

			`detailsCollector.Visit(fullLink)`
fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00
initial commit 2024-06-08 14:11:40 +00:00			`})`

			`detailsCollector.OnRequest(func(r *colly.Request) {`
			`fmt.Println("Visiting", r.URL.String())`
			`})`

fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00			`detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) {`
initial commit 2024-06-08 14:11:40 +00:00
fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00			`// Get logo and trim the url`
initial commit 2024-06-08 14:11:40 +00:00			`logo := e.ChildAttr("div.media-item__image", "style")`
			`cutLeft := "background-image:url("`
			`cutRight := ");"`
			`trimmedLogo := strings.Trim(logo, cutLeft+cutRight)`

			`// Get company name`
			`jobDetails := job{`
			`Title: e.ChildText("h2[class=view-job-details__title]"),`
			`Logo: trimmedLogo,`
			`Company: e.ChildText(".bullet-inline-list > a:first-child"),`
			`Location: e.ChildText(".bullet-inline-list > a:nth-child(2)"),`
			`Type: e.ChildText(".bullet-inline-list > a:nth-child(3)"),`
			`Description: e.ChildText("content.text-block__content > span"),`
			`Link: e.Request.URL.String(),`
			`Skills: skillChecker(e.ChildText("content.text-block__content > span")),`
			`}`
			`jobs = append(jobs, jobDetails)`
fixed issue where it would write the jobs twice 2024-06-08 19:03:19 +00:00			`jobCount++`
			`fmt.Println("Scraped job", jobCount)`
			`if jobCount == maxJobs {`
			`jobsToJson(file)`
			`os.Exit(0)`
			`}`
initial commit 2024-06-08 14:11:40 +00:00			`})`
			`// Handle pagination`
			`c.OnHTML("a.page-link", func(e *colly.HTMLElement) {`
			`nextPage := e.Attr("href")`
			`if nextPage != "" {`
			`fullNextPage := baseUrl + nextPage`
			`fmt.Println("Visiting next page:", fullNextPage)`
			`e.Request.Visit(fullNextPage)`
			`}`
			`})`

			`c.Visit(searchString)`

			`}`