// Source
//
// utils/parse.js

const cheerio = require('cheerio');
const Papa = require('papaparse');

/**
 * Fetches the page at `url`, parses it with Cheerio, and returns the `href` of every
 * `a.<targetClass>` anchor whose matching format label is marked as CSV.
 *
 * Anchors and `.heading span.format-label` elements are paired by position: the n-th
 * anchor is kept only when the n-th format label has `data-format="csv"`. Anchors that
 * have no label at the same position are skipped.
 *
 * @param {string} url - The URL to fetch HTML content from.
 * @param {string} targetClass - The class of the anchor elements to select and extract links from.
 * @returns {Promise<Array<string>>} A promise that resolves to the `href` values of the CSV anchors.
 * @throws {Error} Rejects when the request fails, the server answers with a non-2xx status,
 *   or parsing fails. The error is logged with `console.error` before being rethrown.
 *
 * @example
 * const url = 'https://example.com';
 * const targetClass = 'link-class';
 * const links = await fetchAndParseHTML(url, targetClass);
 * console.log('Extracted Links:', links);
 *
 * @category Utils
 */
async function fetchAndParseHTML(url, targetClass) {
  try {
    const response = await fetch(url);

    // fetch() only rejects on network failures; an HTTP error page must not be
    // parsed as if it were the requested document.
    if (!response.ok) {
      throw new Error(`Request to ${url} failed with status ${response.status} ${response.statusText}`);
    }

    const $ = cheerio.load(await response.text());

    const elements = $(`a.${targetClass}`);
    const formats = $('.heading span.format-label');

    const links = [];

    elements.each((index, element) => {
      // The two selections are not guaranteed to have the same length, so the
      // label at this position may be missing.
      if (formats[index]?.attribs?.['data-format'] === 'csv') {
        links.push(element.attribs.href);
      }
    });

    return links;
  } catch (error) {
    console.error('Error fetching page:', error);
    throw error;
  }
}

/**
 * Fetches a CSV file from the specified URL, decodes its bytes, and parses the text
 * with PapaParse using the first row as the header.
 *
 * The response body is read as raw bytes and decoded explicitly with `encoding`
 * instead of relying on `response.text()`, which always decodes as UTF-8.
 *
 * @param {string} url - The URL to fetch CSV data from.
 * @param {string} [encoding='iso-8859-2'] - Any label accepted by `TextDecoder`, naming
 *   the character encoding of the remote file.
 * @returns {Promise<Object>} A promise that resolves to whatever `Papa.parse` returns for
 *   the decoded text (called with `header: true`).
 * @throws {Error} Rejects when the request fails or the server answers with a non-2xx status.
 * @throws {RangeError} Rejects when `encoding` is not a label supported by `TextDecoder`.
 *
 * @example
 * const url = 'https://example.com/data.csv';
 * const csvData = await fetchAndParseCSV(url);
 * console.log('Parsed CSV Data:', csvData);
 *
 * @category Utils
 */
async function fetchAndParseCSV(url, encoding = 'iso-8859-2') {
  const headers = new Headers();
  // NOTE(review): Content-Type describes a request body, which this GET does not have;
  // `Accept` is probably what was intended — confirm before changing the header name.
  headers.append('Content-Type', 'text/plain;');

  // The second argument of fetch() is an init dictionary; the Headers instance has to be
  // passed under its `headers` key, otherwise it is silently ignored.
  const response = await fetch(url, { headers });

  // fetch() only rejects on network failures, so HTTP errors are surfaced explicitly
  // rather than parsing an error page as CSV.
  if (!response.ok) {
    throw new Error(`Request to ${url} failed with status ${response.status} ${response.statusText}`);
  }

  const buffer = await response.arrayBuffer();
  const csvText = new TextDecoder(encoding).decode(buffer);

  // No `encoding` option is passed to Papa: it only affects file/stream inputs, and the
  // text has already been decoded above.
  const options = {
    header: true,
  };

  return Papa.parse(csvText, options);
}

module.exports = {
  fetchAndParseHTML,
  fetchAndParseCSV
};