Skip to content

Commit

Permalink
Merge pull request #261 from getmaxun/develop
Browse files Browse the repository at this point in the history
chore: release v0.0.5
  • Loading branch information
amhsirak authored Dec 22, 2024
2 parents 6b1b0d3 + 4ac55f0 commit 825555d
Show file tree
Hide file tree
Showing 55 changed files with 4,431 additions and 787 deletions.
File renamed without changes.
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ COPY package*.json ./
COPY maxun-core ./maxun-core

# Install dependencies
RUN npm install
RUN npm install --legacy-peer-deps

# Copy frontend source code and config
COPY src ./src
COPY public ./public
COPY index.html ./
COPY vite.config.js ./
COPY tsconfig.json ./
Expand Down
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@ Maxun lets you train a robot in 2 minutes and scrape the web on auto-pilot. Web
<img src="https://static.scarf.sh/a.png?x-pxid=c12a77cc-855e-4602-8a0f-614b2d0da56a" />

# Installation
1. First, create a file named `.env` in the root folder of the project
2. Example env file can be viewed [here](https://github.com/getmaxun/maxun/blob/master/ENVEXAMPLE). Copy all content of example env to your `.env` file.
3. Choose your installation method below
1. Create a root folder for your project (e.g. 'maxun')
2. Create a file named `.env` in the root folder of the project
3. An example env file can be viewed [here](https://github.com/getmaxun/maxun/blob/master/ENVEXAMPLE). Copy its entire contents into your `.env` file.
4. Choose your installation method below

### Docker Compose
1. Copy paste the [docker-compose.yml file](https://github.com/getmaxun/maxun/blob/master/docker-compose.yml)
2. Ensure you have setup the `.env` file
3. Run the command below
1. Copy and paste the [docker-compose.yml file](https://github.com/getmaxun/maxun/blob/master/docker-compose.yml) into your root folder
2. Ensure you have set up the `.env` file in that same folder
3. Run the command below from a terminal
```
docker-compose up -d
```
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ services:
#build:
#context: .
#dockerfile: server/Dockerfile
image: getmaxun/maxun-backend:v0.0.7
image: getmaxun/maxun-backend:v0.0.9
ports:
- "${BACKEND_PORT:-8080}:${BACKEND_PORT:-8080}"
env_file: .env
Expand All @@ -70,7 +70,7 @@ services:
#build:
#context: .
#dockerfile: Dockerfile
image: getmaxun/maxun-frontend:v0.0.3
image: getmaxun/maxun-frontend:v0.0.5
ports:
- "${FRONTEND_PORT:-5173}:${FRONTEND_PORT:-5173}"
env_file: .env
Expand Down
2 changes: 1 addition & 1 deletion maxun-core/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "maxun-core",
"version": "0.0.6",
"version": "0.0.7",
"description": "Core package for Maxun, responsible for data extraction",
"main": "build/index.js",
"typings": "build/index.d.ts",
Expand Down
95 changes: 63 additions & 32 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const scrapedData = [];

while (scrapedData.length < limit) {
// Get all parent elements matching the listSelector
const parentElements = Array.from(document.querySelectorAll(listSelector));

// Iterate through each parent element
for (const parent of parentElements) {
if (scrapedData.length >= limit) break;
const record = {};

// For each field, select the corresponding element within the parent
for (const [label, { selector, attribute }] of Object.entries(fields)) {
const fieldElement = parent.querySelector(selector);

if (fieldElement) {
if (attribute === 'innerText') {
record[label] = fieldElement.innerText.trim();
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, window.location.origin).href : null;
} else if (attribute === 'href') {
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, window.location.origin).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
let parentElements = Array.from(document.querySelectorAll(listSelector));

// If we only got one element or none, try a more generic approach
if (limit > 1 && parentElements.length <= 1) {
const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
const container = document.querySelector(containerSelector);

if (container) {
const allChildren = Array.from(container.children);

const firstMatch = document.querySelector(listSelector);
if (firstMatch) {
// Get classes from the first matching element
const firstMatchClasses = Array.from(firstMatch.classList);

// Find similar elements by matching most of their classes
parentElements = allChildren.filter(element => {
const elementClasses = Array.from(element.classList);

// Element should share at least 70% of classes with the first match
const commonClasses = firstMatchClasses.filter(cls =>
elementClasses.includes(cls));
return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
});
}
}
}
}
scrapedData.push(record);
}

// Iterate through each parent element
for (const parent of parentElements) {
if (scrapedData.length >= limit) break;
const record = {};

// For each field, select the corresponding element within the parent
for (const [label, { selector, attribute }] of Object.entries(fields)) {
const fieldElement = parent.querySelector(selector);

if (fieldElement) {
if (attribute === 'innerText') {
record[label] = fieldElement.innerText.trim();
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, window.location.origin).href : null;
} else if (attribute === 'href') {
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, window.location.origin).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
}
}
}
scrapedData.push(record);
}

// If we've processed all available elements and still haven't reached the limit,
// break to avoid infinite loop
if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
break;
}
}
return scrapedData
};
return scrapedData;
};


/**
Expand Down
43 changes: 36 additions & 7 deletions maxun-core/src/interpret.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,21 @@ export default class Interpreter extends EventEmitter {

private async applyAdBlocker(page: Page): Promise<void> {
if (this.blocker) {
await this.blocker.enableBlockingInPage(page);
try {
await this.blocker.enableBlockingInPage(page);
} catch (err) {
this.log(`Ad-blocker operation failed:`, Level.ERROR);
}
}
}

private async disableAdBlocker(page: Page): Promise<void> {
if (this.blocker) {
await this.blocker.disableBlockingInPage(page);
try {
await this.blocker.disableBlockingInPage(page);
} catch (err) {
this.log(`Ad-blocker operation failed:`, Level.ERROR);
}
}
}

Expand Down Expand Up @@ -192,8 +200,8 @@ export default class Interpreter extends EventEmitter {
// const actionable = async (selector: string): Promise<boolean> => {
// try {
// const proms = [
// page.isEnabled(selector, { timeout: 5000 }),
// page.isVisible(selector, { timeout: 5000 }),
// page.isEnabled(selector, { timeout: 10000 }),
// page.isVisible(selector, { timeout: 10000 }),
// ];

// return await Promise.all(proms).then((bools) => bools.every((x) => x));
Expand All @@ -214,6 +222,17 @@ export default class Interpreter extends EventEmitter {
// return [];
// }),
// ).then((x) => x.flat());

const presentSelectors: SelectorArray = await Promise.all(
selectors.map(async (selector) => {
try {
await page.waitForSelector(selector, { state: 'attached' });
return [selector];
} catch (e) {
return [];
}
}),
).then((x) => x.flat());

const action = workflowCopy[workflowCopy.length - 1];

Expand All @@ -233,7 +252,7 @@ export default class Interpreter extends EventEmitter {
...p,
[cookie.name]: cookie.value,
}), {}),
selectors,
selectors: presentSelectors,
};
}

Expand Down Expand Up @@ -506,7 +525,11 @@ export default class Interpreter extends EventEmitter {
try {
await executeAction(invokee, methodName, step.args);
} catch (error) {
await executeAction(invokee, methodName, [step.args[0], { force: true }]);
try{
await executeAction(invokee, methodName, [step.args[0], { force: true }]);
} catch (error) {
continue
}
}
} else {
await executeAction(invokee, methodName, step.args);
Expand Down Expand Up @@ -647,7 +670,11 @@ export default class Interpreter extends EventEmitter {
const workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow));

// apply ad-blocker to the current page
await this.applyAdBlocker(p);
try {
await this.applyAdBlocker(p);
} catch (error) {
this.log(`Failed to apply ad-blocker: ${error.message}`, Level.ERROR);
}
const usedActions: string[] = [];
let selectors: string[] = [];
let lastAction = null;
Expand Down Expand Up @@ -767,6 +794,8 @@ export default class Interpreter extends EventEmitter {
public async run(page: Page, params?: ParamType): Promise<void> {
this.log('Starting the workflow.', Level.LOG);
const context = page.context();

page.setDefaultNavigationTimeout(100000);

// Check proxy settings from context options
const contextOptions = (context as any)._options;
Expand Down
48 changes: 24 additions & 24 deletions maxun-core/src/utils/concurrency.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,36 @@
*/
export default class Concurrency {
/**
* Maximum number of workers running in parallel. If set to `null`, there is no limit.
*/
* Maximum number of workers running in parallel. If set to `null`, there is no limit.
*/
maxConcurrency: number = 1;

/**
* Number of currently active workers.
*/
* Number of currently active workers.
*/
activeWorkers: number = 0;

/**
* Queue of jobs waiting to be completed.
*/
* Queue of jobs waiting to be completed.
*/
private jobQueue: Function[] = [];

/**
* "Resolve" callbacks of the waitForCompletion() promises.
*/
* "Resolve" callbacks of the waitForCompletion() promises.
*/
private waiting: Function[] = [];

/**
* Constructs a new instance of concurrency manager.
* @param {number} maxConcurrency Maximum number of workers running in parallel.
*/
* Constructs a new instance of concurrency manager.
* @param {number} maxConcurrency Maximum number of workers running in parallel.
*/
constructor(maxConcurrency: number) {
this.maxConcurrency = maxConcurrency;
}

/**
* Takes a waiting job out of the queue and runs it.
*/
* Takes a waiting job out of the queue and runs it.
*/
private runNextJob(): void {
const job = this.jobQueue.pop();

Expand All @@ -53,12 +53,12 @@ export default class Concurrency {
}

/**
* Pass a job (a time-demanding async function) to the concurrency manager. \
* The time of the job's execution depends on the concurrency manager itself
* (given a generous enough `maxConcurrency` value, it might be immediate,
* but this is not guaranteed).
* @param worker Async function to be executed (job to be processed).
*/
* Pass a job (a time-demanding async function) to the concurrency manager. \
* The time of the job's execution depends on the concurrency manager itself
* (given a generous enough `maxConcurrency` value, it might be immediate,
* but this is not guaranteed).
* @param job Async function to be executed (job to be processed).
*/
addJob(job: () => Promise<any>): void {
// console.debug("Adding a worker!");
this.jobQueue.push(job);
Expand All @@ -72,11 +72,11 @@ export default class Concurrency {
}

/**
* Waits until there is no running nor waiting job. \
* If the concurrency manager is idle at the time of calling this function,
* it waits until at least one job is completed (can be "presubscribed").
* @returns Promise, resolved after there is no running/waiting worker.
*/
* Waits until there is neither a running nor a waiting job. \
* If the concurrency manager is idle at the time of calling this function,
* it waits until at least one job is completed (can be "presubscribed").
* @returns Promise, resolved after there is no running/waiting worker.
*/
waitForCompletion(): Promise<void> {
return new Promise((res) => {
this.waiting.push(res);
Expand Down
8 changes: 6 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "maxun",
"version": "0.0.4",
"version": "0.0.5",
"author": "Maxun",
"license": "AGPL-3.0-or-later",
"dependencies": {
Expand Down Expand Up @@ -36,14 +36,17 @@
"fortawesome": "^0.0.1-security",
"google-auth-library": "^9.14.1",
"googleapis": "^144.0.0",
"i18next": "^24.0.2",
"i18next-browser-languagedetector": "^8.0.0",
"i18next-http-backend": "^3.0.1",
"idcac-playwright": "^0.1.3",
"ioredis": "^5.4.1",
"joi": "^17.6.0",
"jsonwebtoken": "^9.0.2",
"jwt-decode": "^4.0.0",
"loglevel": "^1.8.0",
"loglevel-plugin-remote": "^0.6.8",
"maxun-core": "^0.0.6",
"maxun-core": "^0.0.7",
"minio": "^8.0.1",
"moment-timezone": "^0.5.45",
"node-cron": "^3.0.3",
Expand All @@ -57,6 +60,7 @@
"react": "^18.0.0",
"react-dom": "^18.0.0",
"react-highlight": "0.15.0",
"react-i18next": "^15.1.3",
"react-router-dom": "^6.26.1",
"react-simple-code-editor": "^0.11.2",
"react-transition-group": "^4.4.2",
Expand Down
Loading

0 comments on commit 825555d

Please sign in to comment.