forked from rannes.dev/sw-jobs-scraper
		
	Compare commits
	
		
			No commits in common. "5fbdd9706e4188f44398e96d47f8da9b0f7032da" and "c20690f3238a4009ca5138b61c435f6a7b0d71ba" have entirely different histories.
		
	
	
		
			5fbdd9706e
			...
			c20690f323
		
	
		
							
								
								
									
										49
									
								
								scraper.go
									
									
									
									
									
								
							
							
						
						
									
										49
									
								
								scraper.go
									
									
									
									
									
								
							| @ -4,7 +4,6 @@ import ( | |||||||
| 	"context" | 	"context" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"strings" | 	"strings" | ||||||
| 	"time" |  | ||||||
| 
 | 
 | ||||||
| 	"github.com/aws/aws-lambda-go/lambda" | 	"github.com/aws/aws-lambda-go/lambda" | ||||||
| 	"github.com/gocolly/colly" | 	"github.com/gocolly/colly" | ||||||
| @ -30,13 +29,6 @@ type skills struct { | |||||||
| 	Typescript bool `json:"typescript"` | 	Typescript bool `json:"typescript"` | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| var ( |  | ||||||
| 	jobs      []job |  | ||||||
| 	lastFetch time.Time |  | ||||||
| 	cacheTTL  = time.Minute * 5 |  | ||||||
| 	jobLimit  = 20 |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| func skillChecker(description string) skills { | func skillChecker(description string) skills { | ||||||
| 	return skills{ | 	return skills{ | ||||||
| 		React:      strings.Contains(description, "React"), | 		React:      strings.Contains(description, "React"), | ||||||
| @ -48,9 +40,13 @@ func skillChecker(description string) skills { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func fetchData() error { | // Slice to store job details | ||||||
|  | var jobs []job | ||||||
|  | 
 | ||||||
|  | func handler(ctx context.Context) ([]job, error) { | ||||||
| 
 | 
 | ||||||
| 	baseUrl := "https://thehub.io" | 	baseUrl := "https://thehub.io" | ||||||
|  | 	searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs" | ||||||
| 	// Instantiate default collector | 	// Instantiate default collector | ||||||
| 	c := colly.NewCollector( | 	c := colly.NewCollector( | ||||||
| 		// visit only the hub | 		// visit only the hub | ||||||
| @ -64,16 +60,8 @@ func fetchData() error { | |||||||
| 	excluded := []string{"senior", "lead"} | 	excluded := []string{"senior", "lead"} | ||||||
| 	// Instantiate a new collector to visit the job details page | 	// Instantiate a new collector to visit the job details page | ||||||
| 	detailsCollector := c.Clone() | 	detailsCollector := c.Clone() | ||||||
| 
 |  | ||||||
| 	// Limit the number of jobs to fetch |  | ||||||
| 	jobCount := 0 |  | ||||||
| 
 |  | ||||||
| 	// On every <div> element with class "card__content", call the callback | 	// On every <div> element with class "card__content", call the callback | ||||||
| 	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) { | 	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) { | ||||||
| 		// Return if the job limit has been reached |  | ||||||
| 		if jobCount >= jobLimit { |  | ||||||
| 			return |  | ||||||
| 		} |  | ||||||
| 		// Get the title and ensure it doesn't contain any excluded words | 		// Get the title and ensure it doesn't contain any excluded words | ||||||
| 		title := e.ChildText("span.card-job-find-list__position") | 		title := e.ChildText("span.card-job-find-list__position") | ||||||
| 		for _, excludedWord := range excluded { | 		for _, excludedWord := range excluded { | ||||||
| @ -92,9 +80,6 @@ func fetchData() error { | |||||||
| 	}) | 	}) | ||||||
| 
 | 
 | ||||||
| 	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) { | 	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) { | ||||||
| 		if jobCount >= jobLimit { |  | ||||||
| 			return |  | ||||||
| 		} |  | ||||||
| 		// Get logo and trim the url | 		// Get logo and trim the url | ||||||
| 
 | 
 | ||||||
| 		logo := e.ChildAttr("div.media-item__image", "style") | 		logo := e.ChildAttr("div.media-item__image", "style") | ||||||
| @ -114,7 +99,6 @@ func fetchData() error { | |||||||
| 			Skills:      skillChecker(e.ChildText("content.text-block__content > span")), | 			Skills:      skillChecker(e.ChildText("content.text-block__content > span")), | ||||||
| 		} | 		} | ||||||
| 		jobs = append(jobs, jobDetails) | 		jobs = append(jobs, jobDetails) | ||||||
| 		jobCount++ |  | ||||||
| 	}) | 	}) | ||||||
| 	// Handle pagination | 	// Handle pagination | ||||||
| 	c.OnHTML("a.page-link", func(e *colly.HTMLElement) { | 	c.OnHTML("a.page-link", func(e *colly.HTMLElement) { | ||||||
| @ -125,29 +109,8 @@ func fetchData() error { | |||||||
| 			e.Request.Visit(fullNextPage) | 			e.Request.Visit(fullNextPage) | ||||||
| 		} | 		} | ||||||
| 	}) | 	}) | ||||||
| 	// Visit the initial URL to start scraping |  | ||||||
| 	err := c.Visit("https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&search=developer&paid=true&countryCode=DK&sorting=newJobs") |  | ||||||
| 	if err != nil { |  | ||||||
| 		return err |  | ||||||
| 	} |  | ||||||
| 	return nil |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| func handler(ctx context.Context) ([]job, error) { |  | ||||||
| 	// Check if cache is valid |  | ||||||
| 	if time.Since(lastFetch) < cacheTTL && len(jobs) > 0 { |  | ||||||
| 		return jobs, nil |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	// Fetch new data |  | ||||||
| 	err := fetchData() |  | ||||||
| 	if err != nil { |  | ||||||
| 		return nil, err |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	// Update cache timestamp |  | ||||||
| 	lastFetch = time.Now() |  | ||||||
| 
 | 
 | ||||||
|  | 	c.Visit(searchString) | ||||||
| 	return jobs, nil | 	return jobs, nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user