Skip to content

Commit

Permalink
Updated selector
Browse files Browse the repository at this point in the history
  • Loading branch information
marcomontalbano committed Aug 12, 2017
1 parent 057df39 commit 9cbf1d4
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 14 deletions.
80 changes: 80 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,83 @@
HTML Miner
==========

[![Build Status](https://travis-ci.org/marcomontalbano/html-miner.svg?branch=master)](https://travis-ci.org/marcomontalbano/html-miner)


Install
-------

```sh
# using yarn
yarn add html-miner

# using npm
npm i --save html-miner
```


Example
-------

We have the following HTML snippet and we want to fetch the `title`.

```html
<div class="jumbotron">
<div class="container">
<h1 class="display-3">Hello, world!</h1>
<p>This is a template for a simple marketing or informational website. It includes a large callout called a jumbotron and three supporting pieces of content. Use it as a starting point to create something more unique.</p>
<p><a class="btn btn-primary btn-lg" href="#" role="button">Learn more &raquo;</a></p>
</div>
</div>
<div class="container">
<div class="row">
<div class="col-md-4">
<h2>Heading</h2>
<p>Donec id elit non mi porta gravida at eget metus. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus. Etiam porta sem malesuada magna mollis euismod. Donec sed odio dui. </p>
<p><a class="btn btn-secondary" href="#" role="button">View details &raquo;</a></p>
</div>
<div class="col-md-4">
<h2>Heading</h2>
<p>Donec id elit non mi porta gravida at eget metus. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus. Etiam porta sem malesuada magna mollis euismod. Donec sed odio dui. </p>
<p><a class="btn btn-secondary" href="#" role="button">View details &raquo;</a></p>
</div>
<div class="col-md-4">
<h2>Heading</h2>
<p>Donec sed odio dui. Cras justo odio, dapibus ac facilisis in, egestas eget quam. Vestibulum id ligula porta felis euismod semper. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus.</p>
<p><a class="btn btn-secondary" href="#" role="button">View details &raquo;</a></p>
</div>
</div>

<hr>

<footer>
<p>&copy; Company 2017</p>
</footer>
</div>
```

```javascript
const htmlMiner = require('html-miner');

let json = htmlMiner(html, {
title : 'h1',
headings : 'h2',
greet : $ => { return 'Hi!' }
});

console.log( json );
// {
// title : 'Hello, world!',
// headings : ['Heading', 'Heading', 'Heading'],
// greet : 'Hi!'
// }
```


Development
-----------

```sh
yarn
yarn test
```
31 changes: 26 additions & 5 deletions lib/index.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,36 @@
'use strict';

const cheerio = require('cheerio');
const _ = require('lodash');

// NOTE(review): the next line declares the export with a `selectors` parameter
// and is immediately followed by a second declaration of the same export. It
// looks like the pre-change signature kept by the diff rendering — confirm
// before treating it as live code.
module.exports = (html, selectors) => {
/**
 * Extracts text from `html` according to `originalSelector`.
 *
 * @param html - markup handed to `cheerio.load` (presumably an HTML string —
 *   verify against callers).
 * @param {string|Array|Object} originalSelector - a single selector string, or
 *   an array/object whose values are selector strings or callbacks. Callbacks
 *   are invoked with `($, data)`, where `data` is the result built so far.
 * @returns For a string selector, the value collected for it; otherwise the
 *   `data` array/object keyed like `originalSelector`. Each collected value is
 *   an array when more than one item was gathered, else the single item
 *   (`undefined` when nothing was gathered).
 * @throws {Error} when `originalSelector` is not a string, array-like or
 *   object-like value.
 */
module.exports = (html, originalSelector) => {

// Reject anything that is not a string, array-like or object-like value.
if ( ! _.isString( originalSelector ) && ! _.isArrayLike( originalSelector ) && ! _.isObjectLike( originalSelector ) ) {
throw new Error("'selector' must be string, array or object");
}

const $ = cheerio.load(html);

// Wrap a plain string as `{default: ...}` so one loop handles every input shape;
// the wrapper is undone in the final return.
let selector = _.isString(originalSelector) ? {default:originalSelector} : originalSelector;

// Scratch list of values gathered for the key currently being processed.
let elements = [];
// NOTE(review): the next three lines reference `selectors`, which is only a
// parameter of the first declaration above; they look like pre-change lines
// kept by the diff rendering — confirm.
$( selectors ).each((i, el) => {
elements.push( $(el).text() );
});
// Result container: an array for array-like input, a plain object otherwise.
// NOTE(review): lodash documents strings as array-like, so a string selector
// appears to get an array here with a `default` property set on it — confirm.
let data = _.isArrayLike(originalSelector) ? [] : {};
_.each(selector, (value, key) => {

// NOTE(review): a `return` as the first statement of this callback would make
// everything below it unreachable; it looks like a pre-change line kept by the
// diff rendering — confirm.
return elements.length > 1 ? elements : elements[0];
// Callback entry: call it with the cheerio instance and the data built so far,
// and collect whatever it returns.
if ( _.isFunction( value ) ) {
elements.push( value.apply(this, [$, data]) );
}

// Selector entry: collect the text of every matched element. Each match of
// whitespace + newline(s) + whitespace is replaced by a single "\n", then the
// text is trimmed.
if ( _.isString( value ) ) {
$( value ).each((i, el) => {
elements.push( $(el).text().replace(/\s+\n+\s+/g, "\n").trim() );
});
}

// Store an array only when more than one value was collected; otherwise store
// the single value (undefined if none). Values that are neither function nor
// string collect nothing.
data[key] = elements.length > 1 ? elements : elements[0];
// Reset the scratch list for the next key.
elements = [];

});

// Unwrap the `{default: ...}` wrapper for string input.
return _.isString(originalSelector) ? data.default : data;
};
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "html-miner",
"version": "0.0.1-beta-1",
"version": "0.0.1-beta-2",
"description": "A powerful miner who will scrape html pages for you. ` HTML Scraper ´",
"main": "index.js",
"repository": "https://github.com/marcomontalbano/html-miner.git",
Expand All @@ -21,7 +21,8 @@
"scrape-html"
],
"dependencies": {
"cheerio": "^1.0.0-rc.2"
"cheerio": "^1.0.0-rc.2",
"lodash": "^4.17.4"
},
"devDependencies": {
"mocha": "^3.5.0"
Expand Down
83 changes: 77 additions & 6 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,85 @@ describe('htmlMiner', () => {
});
})

it('should return a string for a tag (e.g. <h1>)', () => {
let text = htmlMiner(html, 'h1');
assert.equal(text, 'Hello, world!');
//
describe('should throw an exception', () => {

    // Each entry is [test title, selector value that must be rejected].
    const invalidSelectors = [
        ['given a number', 4],
        ['given a boolean', true],
    ];

    invalidSelectors.forEach(([title, invalidSelector]) => {
        it(title, () => {
            assert.throws(() => htmlMiner(html, invalidSelector));
        });
    });

});

//
// A string selector that matches exactly one element yields that element's text.
describe('should return a string', () => {

    it('given a string (e.g. \'h1\')', () => {
        const actual = htmlMiner(html, 'h1');
        // Strict comparison, so a non-string result cannot pass through coercion.
        assert.strictEqual(actual, 'Hello, world!');
    });

    it('given a string (e.g. \'.nav-item.active > a\')', () => {
        const actual = htmlMiner(html, '.nav-item.active > a');
        assert.strictEqual(actual, 'Home (current)');
    });

});

//
// A selector matching several elements yields an array of texts; an array of
// selectors yields an array with one entry per selector.
describe('should return an array', () => {

    it('given a string (e.g. \'h2\')', () => {
        const actual = htmlMiner(html, 'h2');
        assert.deepStrictEqual(actual, Array(3).fill('Heading'));
    });

    it('given an array', () => {
        const actual = htmlMiner(html, ['h1', 'h2']);

        assert.deepStrictEqual(actual, [
            'Hello, world!',
            Array(3).fill('Heading')
        ]);
    });

});

it('should return an array of strings for a tag (e.g. <h2>)', () => {
let text = htmlMiner(html, 'h2');
assert.deepEqual(text, Array(3).fill('Heading'));
//
// An object of selectors yields an object with the same keys.
describe('should return an object', () => {

    it('given an object', () => {
        const actual = htmlMiner(html, {
            title : 'h1',
            headings : 'h2'
        });

        assert.deepStrictEqual(actual, {
            title : 'Hello, world!',
            headings : Array(3).fill('Heading')
        });
    });

});

//
describe('given an object', () => {

    it('should execute the defined callback', () => {
        // The callback's return value becomes the value stored under its key.
        const selector = {
            greet : ($, previousData) => 'Hello, world!',
        };
        const expected = {
            greet : 'Hello, world!'
        };

        assert.deepStrictEqual(htmlMiner(html, selector), expected);
    });

});

});
2 changes: 1 addition & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ lodash.keys@^3.0.0:
lodash.isarguments "^3.0.0"
lodash.isarray "^3.0.0"

lodash@^4.15.0:
lodash@^4.15.0, lodash@^4.17.4:
version "4.17.4"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.4.tgz#78203a4d1c328ae1d86dca6460e369b57f4055ae"

Expand Down

0 comments on commit 9cbf1d4

Please sign in to comment.