sw-jobs-api/scrapers/jobindex.py

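"""Scraper for jobindex.dk IT job listings.

Fetches paginated listings from the jobindex.dk IT category, keeps postings
whose title or description matches one of the include keywords (and whose
title avoids the exclude keywords), stops once a posting older than three
days is reached, and persists the results to a local JSON file, pruning
entries older than 30 days.
"""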
import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime, timedelta
# Base URL of the job listings page for software development jobs
base_url = 'https://www.jobindex.dk/jobsoegning/it/'
# Keywords to include in the job listings
include_keywords = ['nextjs', 'svelte', 'react', 'junior', 'python', 'typescript']
# Keywords to exclude from the job titles
exclude_keywords = ['senior']
# File path for the JSON database
db_file = 'jobs_db.json'
# Load existing jobs from the JSON file if it exists
if os.path.exists(db_file):
    try:
        with open(db_file, 'r') as file:
            existing_jobs = json.load(file)
        print(f"Loaded {len(existing_jobs)} existing jobs from {db_file}")
    except json.JSONDecodeError:
        print("Error reading JSON file. Starting with an empty list.")
        existing_jobs = []
else:
    print("JSON file not found. Starting with an empty list.")
    existing_jobs = []
# Convert existing jobs to a set of IDs for duplicate checking
existing_job_ids = set(job['id'] for job in existing_jobs)
# Function to fetch and parse a page
def fetch_jobs_from_page(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url} (status {response.status_code})")
    return BeautifulSoup(response.content, 'html.parser')
# Function to extract jobs from the parsed HTML
def extract_jobs(soup):
    job_listings = soup.find_all('div', class_='jobsearch-result')
    jobs = []
    today = datetime.today()
    for job in job_listings:
        heading = job.find('h4')
        title_element = heading.find('a') if heading else None
        company_element = job.find('div', class_='jix-toolbar-top__company')
        location_element = job.find('span', class_='jix_robotjob--area')
        date_posted_element = job.find('time')
        description_element = job.find('p')

        title = title_element.get_text(strip=True) if title_element else ''
        company_link = company_element.find('a', rel='noopener') if company_element else None
        url = company_link['href'] if company_link else ''
        img_element = job.find('img')
        img = f"https://www.jobindex.dk{img_element['src']}" if img_element else ''
        company = company_element.find('a').get_text(strip=True) if company_element else ''
        location = location_element.get_text(strip=True) if location_element else ''
        date_posted = date_posted_element['datetime'] if date_posted_element else ''
        job_link = title_element['href'] if title_element else ''
        description = description_element.get_text(strip=True) if description_element else ''
        print(f"Debug: title={title}, url={url}, img={img}, company={company}, "
              f"location={location}, date_posted={date_posted}, job_link={job_link}, "
              f"description={description}")  # Detailed debug print

        # Convert date_posted to a datetime object; skip jobs with unparseable dates
        try:
            date_posted_dt = datetime.strptime(date_posted, '%Y-%m-%d')
            if (today - date_posted_dt).days > 3:
                print("Job older than 3 days found. Stopping the scraper.")
                return jobs, False  # Returning jobs and False to indicate stopping
        except ValueError:
            continue

        # Keep the job only if the title avoids the exclude keywords and the
        # title/description contains at least one include keyword
        combined_text = f"{title} {description}".lower()
        excluded = any(keyword in title.lower() for keyword in exclude_keywords)
        included = any(keyword in combined_text for keyword in include_keywords)
        if not excluded and included:
            job_id = f"{title}-{url}-{location}-{date_posted}"
            if job_id not in existing_job_ids:
                jobs.append({
                    'id': job_id,
                    'title': title,
                    'url': url,
                    'img': img,
                    'company': company,
                    'location': location,
                    'date_posted': date_posted,
                    'link': job_link
                })
                existing_job_ids.add(job_id)
                print(f"Added job: {job_id}")  # Debug print for each added job
    return jobs, True
# Function to find the next page URL
def get_next_page_url(soup):
    next_page = soup.find('a', {'aria-label': 'Næste'})
    return next_page['href'] if next_page else None
# Main scraping loop
current_url = base_url
all_jobs = []
while current_url:
    print(f"Fetching jobs from: {current_url}")
    soup = fetch_jobs_from_page(current_url)
    jobs, continue_scraping = extract_jobs(soup)
    all_jobs.extend(jobs)
    print(f"Collected {len(jobs)} jobs from this page.")
    if not continue_scraping:
        break
    current_url = get_next_page_url(soup)
# Combine the existing jobs with the new jobs
all_jobs = existing_jobs + all_jobs
# Remove jobs older than 30 days from the combined list
cutoff_date = datetime.today() - timedelta(days=30)
all_jobs = [job for job in all_jobs if datetime.strptime(job['date_posted'], '%Y-%m-%d') >= cutoff_date]
# Final debug print before saving
print(f"Total jobs to be saved: {len(all_jobs)}")
print(f"Jobs to be saved: {all_jobs}") # Debug print to show jobs to be saved
# Save the new jobs to the JSON file
with open(db_file, 'w') as file:
    json.dump(all_jobs, file, indent=4)
print(f"Total jobs saved: {len(all_jobs)}") # Final output after saving to file