diff --git a/.gitignore b/.gitignore index e4e0ac0..b1d42ed 100644 --- a/.gitignore +++ b/.gitignore @@ -128,4 +128,7 @@ dmypy.json # Pyre type checker .pyre/ -.idea \ No newline at end of file +.idea + +# Yelp files +*.json \ No newline at end of file diff --git a/README.md b/README.md index f60c264..c14a93f 100644 --- a/README.md +++ b/README.md @@ -4,29 +4,64 @@ Contains Python scripts to import and model the Yelp challenge dataset into Neo4 ## Getting Started -### Step 1: +### Step 1: Getting Neo4j Community Edition -Download the [Neo4j Community Edition](https://neo4j.com/download-thanks/?edition=community&release=4.0.1&flavour=unix) ZIP or tarball or start the Neo4j Docker container with the `docker-compose.yml` file. +Download the [Neo4j Community Edition](https://neo4j.com/download-thanks/?edition=community&release=4.0.1&flavour=unix) +ZIP or tarball or start the Neo4j Docker container with the `docker-compose.yml` file. If using the ZIP or tarball, extract the archive to a directory e.g. `$HOME`: +```bash +$ tar -xvzf neo4j-community-4.0.1-unix.tar.gz -C ~/. ``` -tar -xvzf neo4j-community-4.0.1-unix.tar.gz -C ~/. -``` -### Step 2: +### Step 2: Server Plugins and Configuration +The `neo4j-community.x.x.x` directory (where `x.x.x` would be the version you are using) in this project contains details +on the files that need to be changed on the server. Note, the changes must be made on your server files and not on this +project! This project's directory is simply a demonstration. + +The following files must be changed on the server: + +* `plugins/apoc-x.x.x.x-all.jar`: Download the latest [APOC plugin](https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases) and place it under the plugins directory on your server. +* `conf/neo4j.conf`: This has been configured to whitelist the APOC functions we use in the import process. +* `import/{business, review, user}.json`: All Yelp files one wishes to import must be placed here. 
This script only +considers the three JSON files listed here. + +### Step 3: Starting Neo4j If not using the Docker image, start Neo4j using the `neo4j` binary in the `neo4j-community-4.0.1/bin` file. Example: +```bash +$ ~/neo4j-community-4.0.1/bin/neo4j start ``` -~/neo4j-community-4.0.1/bin/neo4j start +Optionally, add the Neo4j binaries to your `PATH` by adding the following line to your `~/.profile`, `~/.bashrc`, `~/.zshrc`, etc.: +```bash +export PATH=$HOME/neo4j-community-4.0.1/bin:$PATH +``` +Then one can simply use: +```bash +$ neo4j start ``` Note this needs to run with Oracle Java 11 or OpenJDK 11. I recommend using [AdoptOpenJDK 11](https://adoptopenjdk.net/installation.html?variant=openjdk11&jvmVariant=hotspot) and setting $JAVA_HOME to the location of the directory e.g. `export JAVA_HOME=/home/david/Downloads/jdk-11.0.5+10`. Neo4j browser should now be running on `http://localhost:7474`. Default username and password is `neo4j` and `neo4j` respectively. -### Step 3: +### Step 4: Python Dependencies Before running the `neo4j_yelp.py` script, make sure that you have installed all of the dependencies and edited `config.py` to contain your credentials. 
To download all the dependencies you can simply type: -``` -pip3 install -r requirements.txt --user +```bash +$ pip3 install -r requirements.txt --user ``` +### Step 5: Import the Dataset + +Now that everything is configured and ready, the import script can be run with: +```bash +$ python3 neo4j_yelp.py +``` +If all goes well, you will see the following output from the terminal: +```bash +[INFO] Clearing graph of any existing data +[INFO] Asserting schema +[INFO] Loading businesses +[INFO] Loading users +[INFO] Loading reviews +``` \ No newline at end of file diff --git a/neo4j_yelp.py b/neo4j_yelp.py index 3398784..4c8e446 100644 --- a/neo4j_yelp.py +++ b/neo4j_yelp.py @@ -5,4 +5,49 @@ graph = Graph(uri) -print(graph.evaluate("MATCH (tom {name: \"Tom Hanks\"}) RETURN tom")) +# Remove every existing node and relationship before importing. +print("[INFO] Clearing graph of any existing data") +graph.evaluate("MATCH (n) DETACH DELETE n") + +# Index Category.id (the property the business import MERGEs on) and require unique ids on Business, User and Review. +print("[INFO] Asserting schema") +graph.evaluate("CALL apoc.schema.assert({Category:['id']},{Business:['id'],User:['id'],Review:['id']})") + +# Load businesses in batches of 10000 and link each one to its categories. +print("[INFO] Loading businesses") +graph.evaluate('CALL apoc.periodic.iterate("' + 'CALL apoc.load.json(\'file:///business.json\') YIELD value RETURN value ' + '"," ' + 'MERGE (b:Business{id:value.business_id}) ' + 'SET b += apoc.map.clean(value, [\'business_id\',\'categories\',\'address\',\'postal_code\'],[]) ' + 'WITH b,value.categories as categories ' + 'UNWIND categories as category ' + 'MERGE (c:Category{id:category}) ' + 'MERGE (b)-[:IN_CATEGORY]->(c)"' + ',{batchSize: 10000, iterateList: true});') + +# Load users in batches of 100 and link each one to the users listed as friends. +print("[INFO] Loading users") +graph.evaluate('CALL apoc.periodic.iterate("' + 'CALL apoc.load.json(\'file:///user.json\') ' + 'YIELD value RETURN value ' + '"," ' + 'MERGE (u:User{id:value.user_id}) ' + 'SET u += apoc.map.clean(value, [\'friends\',\'user_id\'],[0]) ' + 'WITH u,value.friends as friends ' + 'UNWIND friends as friend ' + 'MERGE (u1:User{id:friend}) ' + 'MERGE (u)-[:FRIEND]-(u1) ' + '",{batchSize: 100, iterateList: true});') + 
+# Load reviews in batches of 10000 as REVIEWS relationships from user to business. +print("[INFO] Loading reviews") +graph.evaluate('CALL apoc.periodic.iterate("' + 'CALL apoc.load.json(\'file:///review.json\') ' + 'YIELD value RETURN value ' + '"," ' + 'MERGE (b:Business{id:value.business_id}) ' + 'MERGE (u:User{id:value.user_id}) ' + 'MERGE (u)-[r:REVIEWS]->(b) ' + 'SET r += apoc.map.clean(value, [\'business_id\',\'user_id\',\'review_id\'],[0])' + '",{batchSize: 10000, iterateList: true});') diff --git a/requirements.txt b/requirements.txt index 26b0ce9..945253d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -py2neo \ No newline at end of file +py2neo==4.3.0 \ No newline at end of file