diff --git a/.browserslistrc b/.browserslistrc new file mode 100644 index 0000000..afe4650 --- /dev/null +++ b/.browserslistrc @@ -0,0 +1,5 @@ +# https://github.com/browserslist/browserslist#browserslistrc + +last 2 versions +> 0.2% +not dead diff --git a/.commitlintrc.json b/.commitlintrc.json new file mode 100644 index 0000000..5bed7cb --- /dev/null +++ b/.commitlintrc.json @@ -0,0 +1,8 @@ +{ + "rules": { + "body-max-line-length": [ + 0, + "always" + ] + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..2b740bf --- /dev/null +++ b/.editorconfig @@ -0,0 +1,19 @@ +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true +# Unix-style newlines with a newline ending every file +end_of_line = lf +insert_final_newline = true + +[*.{js,css,scss}] +quote_type = single + +[*.{yml,yaml}] +quote_type = double + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..262d6bd --- /dev/null +++ b/.gitattributes @@ -0,0 +1,16 @@ +# Set default behavior to automatically normalize line endings. +* text=auto + +# Force bash scripts to always use LF line endings so that if a repo is accessed +# in Unix via a file share from Windows, the scripts will work. +*.sh text eol=lf + +# Force batch scripts to always use CRLF line endings so that if a repo is accessed +# in Windows via a file share from Linux, the scripts will work. +*.{cmd,[cC][mM][dD]} text eol=crlf +*.{bat,[bB][aA][tT]} text eol=crlf + +# Denote all files that are truly binary and should not be modified. +*.png binary +*.jpg binary +*.ico binary diff --git a/.github/workflows/pages-deploy.yml b/.github/workflows/pages-deploy.yml new file mode 100644 index 0000000..1dc75e8 --- /dev/null +++ b/.github/workflows/pages-deploy.yml @@ -0,0 +1,71 @@ +name: "Build and Deploy" +on: + push: + branches: + - main + - master + paths-ignore: + - .gitignore + - README.md + - LICENSE + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +# Allow one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + # submodules: true + # If using the 'assets' git submodule from Chirpy Starter, uncomment above + # (See: https://github.com/cotes2020/chirpy-starter/tree/main/assets) + + - name: Setup Pages + id: pages + uses: actions/configure-pages@v3 + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.2 # reads from a '.ruby-version' or '.tools-version' file if 'ruby-version' is omitted + bundler-cache: true + + - name: Build site + run: bundle exec jekyll b -d "_site${{ steps.pages.outputs.base_path }}" + env: + JEKYLL_ENV: "production" + + - name: Test site + run: | + bundle exec htmlproofer _site --disable-external --check-html --allow_hash_href + + - name: Upload site artifact + uses: actions/upload-pages-artifact@v1 + with: + path: "_site${{ steps.pages.outputs.base_path }}" + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..267d370 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# Bundler cache 
+.bundle +vendor +Gemfile.lock + +# Jekyll cache +.jekyll-cache +_site + +# RubyGems +*.gem + +# NPM dependencies +node_modules +package-lock.json + +# IDE configurations +.idea +.vscode + +# Misc diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..58062c5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "assets/lib"] + path = assets/lib + url = https://github.com/cotes2020/chirpy-static-assets.git diff --git a/.husky/commit-msg b/.husky/commit-msg new file mode 100755 index 0000000..4037788 --- /dev/null +++ b/.husky/commit-msg @@ -0,0 +1,4 @@ +#!/bin/sh +. "$(dirname "$0")/_/husky.sh" + +npx --no -- commitlint -x $(npm root -g)/@commitlint/config-conventional --edit diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..36b3563 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,3 @@ +{ + "trailingComma": "none" +} diff --git a/.stylelintrc.json b/.stylelintrc.json new file mode 100644 index 0000000..f489fee --- /dev/null +++ b/.stylelintrc.json @@ -0,0 +1,23 @@ +{ + "extends": "stylelint-config-standard-scss", + "rules": { + "no-descending-specificity": null, + "shorthand-property-no-redundant-values": null, + "at-rule-no-vendor-prefix": null, + "property-no-vendor-prefix": null, + "selector-no-vendor-prefix": null, + "value-no-vendor-prefix": null, + "color-function-notation": "legacy", + "alpha-value-notation": "number", + "selector-not-notation": "simple", + "color-hex-length": "long", + "declaration-block-single-line-max-declarations": 3, + "scss/operator-no-newline-after": null, + "rule-empty-line-before": [ + "always", + { "ignore": ["after-comment", "first-nested", "inside-block"] } + ], + "value-keyword-case": ["lower", { "ignoreProperties": ["/^\\$/"] }], + "media-feature-range-notation": "prefix" + } +} diff --git a/.versionrc.json b/.versionrc.json new file mode 100644 index 0000000..4b880d3 --- /dev/null +++ b/.versionrc.json @@ -0,0 +1,20 @@ +{ + "skip": { + "commit": true, + "tag": true + }, + "types": [ + { + "type": "feat", + "section": "Features" + }, + { + "type": "fix", + "section": "Bug Fixes" + }, + { + "type": "perf", + "section": "Improvements" + } + ] +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e9c5bd6 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,245 @@ +# Changelog + +All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines. 
+ +## [6.1.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v6.0.0...v6.1.0) (2023-07-02) + +### Features + +* **i18n:** add Thai locale file ([#1087](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1087)) ([a60e907](https://github.com/cotes2020/jekyll-theme-chirpy/commit/a60e90791d24811caff78e21c71dc85d6a729438)) + +### Bug Fixes + +* missing xml escape for `alt` of preview image ([#1113](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1113)) ([8b0fbf5](https://github.com/cotes2020/jekyll-theme-chirpy/commit/8b0fbf5a834276f273274e4d614edd71e339cbb0)) +* the cached image is covered by shimmer ([#1100](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1100)) ([df8ff54](https://github.com/cotes2020/jekyll-theme-chirpy/commit/df8ff546ec1c8d21a3d25e0124665001fcf756f3)) +* **ui:** min-height of `page` layout exceeds the mobile screen ([73af591](https://github.com/cotes2020/jekyll-theme-chirpy/commit/73af59194ab935d38b89d298fea0e96e13be7cb7)) +* **webfont:** resume semi-bold of font family `Source Sans Pro` ([c4da99c](https://github.com/cotes2020/jekyll-theme-chirpy/commit/c4da99c7ea5d6e32b1f1b815d7d8d6ae7b0f55de)) + +### Improvements + +* **build:** use `jekyll-include-cache` plugin to reduce build time ([#1098](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1098)) ([4fe145e](https://github.com/cotes2020/jekyll-theme-chirpy/commit/4fe145e9809ee1b370d9891135939534751462d0)), closes [#1094](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1094) +* CJK characters of the "Search Cancel" button will wrap ([#1105](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1105)) ([b6d1992](https://github.com/cotes2020/jekyll-theme-chirpy/commit/b6d1992f85ec543220e826087dcc89870e7e2c00)) +* **ui:** avoid blank space at the bottom of the homepage preview image ([ce2f6f5](https://github.com/cotes2020/jekyll-theme-chirpy/commit/ce2f6f5abef7a8b874e08d1f18c1fd002650dbf1)) +* **ui:** improve hover color of sidebar nav items in light mode ([728094d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/728094d1ba67a1e7c0a11e1c6c69bf87af9a767b)) + +## [6.0.1](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v6.0.0...v6.0.1) (2023-05-19) + +### Bug Fixes + +* **home:** preview image missing `[alt]` and `img_path` ([#1044](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1044)) ([aba9468](https://github.com/cotes2020/jekyll-theme-chirpy/commit/aba9468b5332802db961166889d4c4a84e404a2c)) +* **layout:** restore the margin bottom of the main area ([#1047](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1047)) ([eb40f51](https://github.com/cotes2020/jekyll-theme-chirpy/commit/eb40f51c84b011a7c301279527f544ad27efd5eb)) +* **post, page:** image link loses shimmer effect ([#1046](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1046)) ([3bd881d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/3bd881da70d685d10659f47bfe0e79cd02e7af92)) +* **typography:** long string for update-list is not truncated ([#1050](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1050)) ([a51d31c](https://github.com/cotes2020/jekyll-theme-chirpy/commit/a51d31c55a37fbe034f0b0f699f4df0b6a14ba8f)), closes [#1049](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1049) + +## [6.0.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.6.1...v6.0.0) (2023-05-16) + +### ⚠ BREAKING CHANGES + +* rename assets origin configuration files + +### Features + +* add a hook to insert custom metadata in `head` tag 
([#1015](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1015)) ([fe20341](https://github.com/cotes2020/jekyll-theme-chirpy/commit/fe203417d993508eedf5b9044fe53c4a566e44f9)) +* **i18n:** add sl-SI.yml with slovenian translations ([#989](https://github.com/cotes2020/jekyll-theme-chirpy/issues/989)) ([42a700a](https://github.com/cotes2020/jekyll-theme-chirpy/commit/42a700aa37889faa32d7ec1f6776ce4b9d845dc4)) +* **i18n:** add Traditional Chinese (Taiwan) localization file ([#961](https://github.com/cotes2020/jekyll-theme-chirpy/issues/961)) ([d97f95f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/d97f95fca0bcd450ea50709ffba0217f7e65d339)) +* **i18n:** added Swedish localization file ([#969](https://github.com/cotes2020/jekyll-theme-chirpy/issues/969)) ([fe70479](https://github.com/cotes2020/jekyll-theme-chirpy/commit/fe7047959e3694c6e603e764ded30dacd49e6aa9)) +* support hiding the modification date of a post ([#1020](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1020)) ([8da583d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/8da583d403456f6460ec1a6ebcbb0c2ca8127ff6)) +* **ui:** improve code snippet design ([6d99f5c](https://github.com/cotes2020/jekyll-theme-chirpy/commit/6d99f5cc36a69e5ccff51f81ba448c798d92e12e)) +* **ui:** improve the design for top bar ([83f1c34](https://github.com/cotes2020/jekyll-theme-chirpy/commit/83f1c34f92d85f3953ca9c9818be5399962bf1c9)) +* **ui:** new design footer content layout ([3210c59](https://github.com/cotes2020/jekyll-theme-chirpy/commit/3210c59466150dc04b4e4bdfc1ffd0e38adcff43)) +* **ui:** redesign the sidebar ([83bbe4a](https://github.com/cotes2020/jekyll-theme-chirpy/commit/83bbe4ac939edfd1706e68c080562e3462f83519)) +* **ui:** show preview image in home page ([97b8dfe](https://github.com/cotes2020/jekyll-theme-chirpy/commit/97b8dfeed6ce7677f6472e28dc3b03f3c2968b12)) + +### Bug Fixes + +* parameter parsing error in image URL ([#1022](https://github.com/cotes2020/jekyll-theme-chirpy/issues/1022)) ([ee88cec](https://github.com/cotes2020/jekyll-theme-chirpy/commit/ee88cec270ea5938f98913a3edf28a684cfbd6c0)) +* **rss:** double quotes in the post title will break the XML structure ([#965](https://github.com/cotes2020/jekyll-theme-chirpy/issues/965)) ([1719d81](https://github.com/cotes2020/jekyll-theme-chirpy/commit/1719d81d00b32b107c35b3903089be84a9b28a6c)) + +### refactor + +* rename assets origin configuration files ([c283e77](https://github.com/cotes2020/jekyll-theme-chirpy/commit/c283e7782fa9562d82d9855fd280a573fd58c75f)) + +### Improvements + +* **assets:** reduce HTTP requests to CDN ([9d97120](https://github.com/cotes2020/jekyll-theme-chirpy/commit/9d971201978e993a9af337d9cd5396a1ea225f00)) +* calculate heading font size dynamically ([#983](https://github.com/cotes2020/jekyll-theme-chirpy/issues/983)) ([52f5ee9](https://github.com/cotes2020/jekyll-theme-chirpy/commit/52f5ee9cd3f92a6e8f25eaa203831546cda85db6)) +* **i18n:** set the global default locales to "en" ([#979](https://github.com/cotes2020/jekyll-theme-chirpy/issues/979)) ([61fdbcb](https://github.com/cotes2020/jekyll-theme-chirpy/commit/61fdbcb83a3601ecae62ec230602b94a5eb832e1)) +* **tools:** avoid initialization interruption in single branch forks ([#992](https://github.com/cotes2020/jekyll-theme-chirpy/issues/992)) ([e90461a](https://github.com/cotes2020/jekyll-theme-chirpy/commit/e90461aa3c81633863db6a12c5924ddba33bd08e)) +* **ui:** improve categories color in dark mode 
([414dd13](https://github.com/cotes2020/jekyll-theme-chirpy/commit/414dd132aed70f4bd96cb712d00eacc82d2753e9)) +* **ui:** improve hover effect for post preview cards ([7626e4d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/7626e4d00544346a46b6e5ff2f3a99d234defe09)) +* **ui:** improve hover effect of trending tags ([34499f0](https://github.com/cotes2020/jekyll-theme-chirpy/commit/34499f0c927ce8fea3705dc2f0f0e6805cabda43)) +* **ui:** improve inline code in light mode ([e38309f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/e38309f3bd1302ffe60b682136b6efaf96f4d9ae)) +* **ui:** improve related posts design ([2918da9](https://github.com/cotes2020/jekyll-theme-chirpy/commit/2918da9f29465618d557c082ff3a2f23d7519049)) +* **ui:** improve the color of prompts in dark mode ([8cbbcfa](https://github.com/cotes2020/jekyll-theme-chirpy/commit/8cbbcfa26da0addd88affada23a65770250f2404)) +* **ui:** lighten the link color in light-mode ([7c23a4e](https://github.com/cotes2020/jekyll-theme-chirpy/commit/7c23a4ebc53b9e231c214e04f8ac0803cbcdb720)) +* **ui:** mute the marker in lists ([0c80552](https://github.com/cotes2020/jekyll-theme-chirpy/commit/0c80552d772b874e2a161f1270294faa3af18d4a)) +* **ui:** uniform the muted text color ([aadf939](https://github.com/cotes2020/jekyll-theme-chirpy/commit/aadf9393d5c7f7528d453c4e68eba4f5cbb85bd9)) +* **ux:** improve LQIP fade in effect ([003e7b6](https://github.com/cotes2020/jekyll-theme-chirpy/commit/003e7b60c93988a7bfae4c03a8346d4f8a5f0bb6)) + +## [5.6.1](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.6.0...v5.6.1) (2023-03-30) + +### Bug Fixes + +* **deps:** `tocbot` has no initialization detection ([#957](https://github.com/cotes2020/jekyll-theme-chirpy/issues/957)) ([8225174](https://github.com/cotes2020/jekyll-theme-chirpy/commit/8225174cb5e02fda7b3cc548ec821c876b0a5139)) +* mode-toggle leads to Disqus loading failure ([#945](https://github.com/cotes2020/jekyll-theme-chirpy/issues/945)) ([6fec411](https://github.com/cotes2020/jekyll-theme-chirpy/commit/6fec411c18ca5689c467c7b216ddeda02df23623)) +* pageviews not updated immediately ([8b4f99c](https://github.com/cotes2020/jekyll-theme-chirpy/commit/8b4f99c87f9a9227f47e84fb39d7b0f551d6f4dd)) + +## [5.6.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.5.2...v5.6.0) (2023-03-17) + +### Features + +* change TOC plugin to `tocbot` ([#774](https://github.com/cotes2020/jekyll-theme-chirpy/issues/774)) ([02b7bd5](https://github.com/cotes2020/jekyll-theme-chirpy/commit/02b7bd5095a2affe5b4c5ed7b5b182baaf642ff3)) +* **i18n:** add Greek Language Support. 
([#903](https://github.com/cotes2020/jekyll-theme-chirpy/issues/903)) ([712a9b2](https://github.com/cotes2020/jekyll-theme-chirpy/commit/712a9b22401ce591cf4c0bb03fbdd1693fee30bb)) +* **ux:** turn home page posts into clickable cards ([#895](https://github.com/cotes2020/jekyll-theme-chirpy/issues/895)) ([b85f633](https://github.com/cotes2020/jekyll-theme-chirpy/commit/b85f6330dea666350631c4461b742cdb54c5f052)) + +### Bug Fixes + +* css selector string escaping vulnerability ([#888](https://github.com/cotes2020/jekyll-theme-chirpy/issues/888)) ([5c6ec9d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/5c6ec9d06b6571e2c0efe6652078442dca8af477)) +* mathematics cannot scroll horizontally ([#760](https://github.com/cotes2020/jekyll-theme-chirpy/issues/760)) ([4681df7](https://github.com/cotes2020/jekyll-theme-chirpy/commit/4681df715118a37ae1e91b588de0adb67f4e331a)) +* notch status bar doesn't match theme color ([#918](https://github.com/cotes2020/jekyll-theme-chirpy/issues/918)) ([820ba62](https://github.com/cotes2020/jekyll-theme-chirpy/commit/820ba62e9e939090523a7077d01d01bd78ec84eb)) +* some console snippets will be incompletely copied ([e8e4901](https://github.com/cotes2020/jekyll-theme-chirpy/commit/e8e4901e340dd7e5fc5f656dd3c7bcd6c97b886a)) + +## [5.5.2](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.5.1...v5.5.2) (2023-01-30) + +### Bug Fixes + +* position of prompt icon is incorrect in paragraph on mobile ([5df953f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/5df953f6c877e2aa3f1f4981c97a0b8007abe6d4)) + +## [5.5.1](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.5.0...v5.5.1) (2023-01-29) + +### Bug Fixes + +* the icon position of the prompts in the list is incorrect ([0c9558d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/0c9558de8a01e9ab795778f351a8bbf4d6b21763)) + +## [5.5.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.4.0...v5.5.0) (2023-01-29) + +### Features + +* **i18n:** add Arabic translation ([#857](https://github.com/cotes2020/jekyll-theme-chirpy/issues/857)) ([765af53](https://github.com/cotes2020/jekyll-theme-chirpy/commit/765af53b77e5c63804784d5728f5970ae274c2c7)) +* **i18n:** add Czech language ([#833](https://github.com/cotes2020/jekyll-theme-chirpy/issues/833)) ([98d48f5](https://github.com/cotes2020/jekyll-theme-chirpy/commit/98d48f5da412276d4a0c99cd01a87b19349bc6bc)) +* **i18n:** add Finnish translations ([#843](https://github.com/cotes2020/jekyll-theme-chirpy/issues/843)) ([d6d0318](https://github.com/cotes2020/jekyll-theme-chirpy/commit/d6d03183eaf94b44e037cc48b6e1c47cee183f6e)) +* **i18n:** add Italian translation ([#850](https://github.com/cotes2020/jekyll-theme-chirpy/issues/850)) ([9a011e1](https://github.com/cotes2020/jekyll-theme-chirpy/commit/9a011e14d66195d8b2fb9ec62f3e60a3e56cd032)) + +### Bug Fixes + +* copy command line incomplete(`.gp` part) ([41ed331](https://github.com/cotes2020/jekyll-theme-chirpy/commit/41ed33145639415148aec8e85edc7a6fd0de0ca3)) +* correct encoding of spaces in share URLs ([#835](https://github.com/cotes2020/jekyll-theme-chirpy/issues/835)) ([f2d2858](https://github.com/cotes2020/jekyll-theme-chirpy/commit/f2d285844e6e2979f2b0eec1d20073d3c05b6c0c)) +* post's image would cover the PWA update alert ([bd374dd](https://github.com/cotes2020/jekyll-theme-chirpy/commit/bd374dd383c50f89c8f018ecb4e25772eeb8f6d8)) +* prompt with nested blockquotes renders incorrectly ([#846](https://github.com/cotes2020/jekyll-theme-chirpy/issues/846)) 
([babb4a0](https://github.com/cotes2020/jekyll-theme-chirpy/commit/babb4a0c5a58ceb2e4093bc465670accdd526c18)) + +## [5.4.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.3.2...v5.4.0) (2022-12-27) + +### Features + +* add `rel="me"` to Mastodon sidebar contact links for verification ([#807](https://github.com/cotes2020/jekyll-theme-chirpy/issues/807)) ([d2190c7](https://github.com/cotes2020/jekyll-theme-chirpy/commit/d2190c726f61c8c9732b88b4aecf699dc8bc7deb)) +* add embed video support ([ed6dc53](https://github.com/cotes2020/jekyll-theme-chirpy/commit/ed6dc539eff7003a3765bcd8c31ae5e91a863d65)) +* add shimmer background when image loads ([ab16fdc](https://github.com/cotes2020/jekyll-theme-chirpy/commit/ab16fdc7fc26811130b98a1773beb62bff6182e8)) +* set preview image ratio to 1.91 : 1 ([4b6ccbc](https://github.com/cotes2020/jekyll-theme-chirpy/commit/4b6ccbcbccce27b9fcb035812efefe4eb69301cf)) +* support dark and light mode images ([#481](https://github.com/cotes2020/jekyll-theme-chirpy/issues/481)) ([9306c7b](https://github.com/cotes2020/jekyll-theme-chirpy/commit/9306c7b39ecf9d9146bc1a25eebedc38eb2c3dd6)) +* support LQIP for images ([bffaf63](https://github.com/cotes2020/jekyll-theme-chirpy/commit/bffaf6374f265cec96ef743d42b46fbec3b59797)) + +### Bug Fixes + +* `hreflang` tag attribute of feed misses `site.alt_lang` ([7651d28](https://github.com/cotes2020/jekyll-theme-chirpy/commit/7651d2851b4bb7d8f0d068b62c036c89a1089bbc)) +* `og:image` will be incorrect if the image uses a cross-domain URL ([8de1abd](https://github.com/cotes2020/jekyll-theme-chirpy/commit/8de1abda6be3633982392178731431b0ddb1b52b)) +* refactoring error when the image URL contains parameters ([ec98f07](https://github.com/cotes2020/jekyll-theme-chirpy/commit/ec98f07aca0b80a9c07fbcdc8e0d7d66dba98ed2)) +* spaces in post title are encoded when sharing ([7efd2f8](https://github.com/cotes2020/jekyll-theme-chirpy/commit/7efd2f8aa2ea1c3aeb7d740bf9a018881c26fe65)) + +### Improvements + +* **cdn:** optimize cache policy for static assets ([7fb0ee0](https://github.com/cotes2020/jekyll-theme-chirpy/commit/7fb0ee0bedb63eee3f90a49c6d7fb8b5d78c9830)) + +## [5.3.2](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.3.1...v5.3.2) (2022-11-22) + +### Bug Fixes + +* `mermaid` occasionally fails to initialize ([#536](https://github.com/cotes2020/jekyll-theme-chirpy/issues/536)) ([48f14e3](https://github.com/cotes2020/jekyll-theme-chirpy/commit/48f14e39ac81bbfb3b9913ea3ee789d775b2d1ae)) +* **comment:** disqus doesn't follow theme mode switching ([b0d5956](https://github.com/cotes2020/jekyll-theme-chirpy/commit/b0d5956f5a0ed894984d6b1754efeba04d8bc966)) +* restore full-text search ([#741](https://github.com/cotes2020/jekyll-theme-chirpy/issues/741)) ([6774e0e](https://github.com/cotes2020/jekyll-theme-chirpy/commit/6774e0e1fb37cf467b14be481347412713763f05)) +* the image URL in the SEO-related tags is incomplete ([#754](https://github.com/cotes2020/jekyll-theme-chirpy/issues/754)) ([f6e9a3f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/f6e9a3fccf7ab34db71f8aefaf86fdcc05861076)) + +## [5.3.1](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.3.0...v5.3.1) (2022-10-25) + +### Bug Fixes + +* 404 page missing title in tablet/desktop view ([5511b28](https://github.com/cotes2020/jekyll-theme-chirpy/commit/5511b2883fd5a395fddfb642588d00c122f18da7)) +* prompt content overflows horizontally ([#705](https://github.com/cotes2020/jekyll-theme-chirpy/issues/705)) 
([fb13e32](https://github.com/cotes2020/jekyll-theme-chirpy/commit/fb13e3219b5eca0d2e4f86a1ecabfab75240369f)) +* **tools:** multiple configuration files will fail the test ([80cb0b3](https://github.com/cotes2020/jekyll-theme-chirpy/commit/80cb0b371754e96772a7907877a8ce196398ba3d)) + +### Improvements + +* **layout:** improve the min-height of main content ([#674](https://github.com/cotes2020/jekyll-theme-chirpy/issues/674)) ([49bb93c](https://github.com/cotes2020/jekyll-theme-chirpy/commit/49bb93cc0c89ad9cfaad5edcf9cb28c3d5134575)) +* modify checkbox icon with `Liquid` ([1fd665b](https://github.com/cotes2020/jekyll-theme-chirpy/commit/1fd665bf4990c26ae23635c511c5abc9640184d1)) +* optimize the extra padding in lists ([#703](https://github.com/cotes2020/jekyll-theme-chirpy/issues/703)) ([39da11e](https://github.com/cotes2020/jekyll-theme-chirpy/commit/39da11e3f3685f49321757576d2b87a48bf25db5)), closes [#702](https://github.com/cotes2020/jekyll-theme-chirpy/issues/702) +* **posts:** improve core block bottom padding ([d2fb98b](https://github.com/cotes2020/jekyll-theme-chirpy/commit/d2fb98b3e57f2f6c3fc3816551cd0721731adf40)) +* truncate post content for search results ([647eea8](https://github.com/cotes2020/jekyll-theme-chirpy/commit/647eea8dbd716f9d3cb8330c3139fa753903f51d)) +* **typography:** optimize the line height of post content ([eac3f9b](https://github.com/cotes2020/jekyll-theme-chirpy/commit/eac3f9b434ca77e3dc64eea9cedea7b93e7b306b)) + +### Others + +* **giscus:** add `reactions-enabled` option ([#712](https://github.com/cotes2020/jekyll-theme-chirpy/issues/712)) ([70662a0](https://github.com/cotes2020/jekyll-theme-chirpy/commit/70662a0365e6b9378602dc0a57462ddad5aebcf5)) +* **locale:** restore options for changing date format ([#716](https://github.com/cotes2020/jekyll-theme-chirpy/issues/716)) ([f904e8c](https://github.com/cotes2020/jekyll-theme-chirpy/commit/f904e8cd48c343cc31e25859d9d50bfe2c056f41)) +* remove site config option `prefer_datetime_locale` ([6852ceb](https://github.com/cotes2020/jekyll-theme-chirpy/commit/6852ceb280927ff4e753a3e1131f2b396d9807d0)) + +## [5.3.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.2.1...v5.3.0) (2022-09-23) + +### Features + +* add multiple authors to a post ([#677](https://github.com/cotes2020/jekyll-theme-chirpy/issues/677)) ([f1d9e99](https://github.com/cotes2020/jekyll-theme-chirpy/commit/f1d9e99bc02d3cd0a6b0cd1beac545f0cc7a24f8)), closes [#675](https://github.com/cotes2020/jekyll-theme-chirpy/issues/675) +* **i18n:** add Bulgarian support ([#612](https://github.com/cotes2020/jekyll-theme-chirpy/issues/612)) ([2fed338](https://github.com/cotes2020/jekyll-theme-chirpy/commit/2fed338ce6d078bf528c9717201fbc475f88cd22)) +* **i18n:** add German locale file ([#663](https://github.com/cotes2020/jekyll-theme-chirpy/issues/663)) ([940b281](https://github.com/cotes2020/jekyll-theme-chirpy/commit/940b2810e95065e30600ae8d5e4612e7183da60e)) +* **i18n:** add Hungarian locale file ([#597](https://github.com/cotes2020/jekyll-theme-chirpy/issues/597), [#598](https://github.com/cotes2020/jekyll-theme-chirpy/issues/598)) ([b032977](https://github.com/cotes2020/jekyll-theme-chirpy/commit/b0329775fc24d0323e5cc04cda46ece8b4531802)) +* **i18n:** add Turkish language ([#631](https://github.com/cotes2020/jekyll-theme-chirpy/issues/631)) ([ad137fa](https://github.com/cotes2020/jekyll-theme-chirpy/commit/ad137fa2945b1870b9c1dd5e9212a5f4af7c3580)) + +### Bug Fixes + +* add missing color to linkedin icon for share list 
([#683](https://github.com/cotes2020/jekyll-theme-chirpy/issues/683)) ([0dcd39d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/0dcd39d491c9c49e4acf7f75f83fe6e1d1839e37)) +* code contains spaces in headings ([#644](https://github.com/cotes2020/jekyll-theme-chirpy/issues/644)) ([3fa1bf3](https://github.com/cotes2020/jekyll-theme-chirpy/commit/3fa1bf305451f645a7f3aa93863b076463c8f165)) +* correct spelling of `panel` ([#686](https://github.com/cotes2020/jekyll-theme-chirpy/issues/686)) ([b288587](https://github.com/cotes2020/jekyll-theme-chirpy/commit/b288587c1c3d113a1c52c2d25fb46cddda348961)) +* correct the i18n for tab titles ([0c5b697](https://github.com/cotes2020/jekyll-theme-chirpy/commit/0c5b697fd3b283b6a5c926742b61ed49d8688c18)) +* the `code` doesn't wrap inside the prompt ([#626](https://github.com/cotes2020/jekyll-theme-chirpy/issues/626)) ([378b65a](https://github.com/cotes2020/jekyll-theme-chirpy/commit/378b65a0617787813519dde74d6f741f255eff3d)) + +## [5.2.1](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.2.0...v5.2.1) (2022-06-17) + +### Bug Fixes + +* exclude CHANGELOG from output ([971fe03](https://github.com/cotes2020/jekyll-theme-chirpy/commit/971fe03ec329ae49e7d60fe3af6101cfbd1acd6c)) +* **PWA:** sometimes update notification is not triggered ([96af729](https://github.com/cotes2020/jekyll-theme-chirpy/commit/96af7291ea5b2c5ed6372e7b6f7725e67c69f1ba)) + +## [5.2.0](https://github.com/cotes2020/jekyll-theme-chirpy/compare/v5.1.0...v5.2.0) (2022-06-09) + +### Features + +* add es-ES support to locales ([#533](https://github.com/cotes2020/jekyll-theme-chirpy/issues/533)) ([efe75ad](https://github.com/cotes2020/jekyll-theme-chirpy/commit/efe75adf2784956afb7a0b67f6634b146d9cb03b)) +* add fr-FR support to locales ([#582](https://github.com/cotes2020/jekyll-theme-chirpy/issues/582)) ([94e8144](https://github.com/cotes2020/jekyll-theme-chirpy/commit/94e81447afa457b1a6b7e8f487c47502803556d7)) +* add Vietnamese locale ([#517](https://github.com/cotes2020/jekyll-theme-chirpy/issues/517)) ([171463d](https://github.com/cotes2020/jekyll-theme-chirpy/commit/171463d76da9b7bc25dd327b8f0a868ea79e388b)) +* add pt-BR support to locales ([c2c503f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/c2c503f63336884282b6bda4ec0703d6ae76771b)) +* add option to turn off PWA ([#527](https://github.com/cotes2020/jekyll-theme-chirpy/issues/527)) ([106c981](https://github.com/cotes2020/jekyll-theme-chirpy/commit/106c981bac71e7434204a77e1f0c9c61d6eb1509)) +* **PWA:** add Service Worker update notification ([d127183](https://github.com/cotes2020/jekyll-theme-chirpy/commit/d127183b9774f6321e409acdb66bf8a85d8814be)) +* support showing description of preview image ([2bd6efa](https://github.com/cotes2020/jekyll-theme-chirpy/commit/2bd6efa95a174ac44e30a3af1e57e6f40d6e0e3a)) + +### Bug Fixes + +* alt is not a valid attribute for 'a' tag ([58928db](https://github.com/cotes2020/jekyll-theme-chirpy/commit/58928dbc9068db4e4cda4371eeae1865920dce6a)) +* assets URL is missing `baseurl` in self-hosted mode ([#591](https://github.com/cotes2020/jekyll-theme-chirpy/issues/591)) ([54124d5](https://github.com/cotes2020/jekyll-theme-chirpy/commit/54124d5134995fce52e4c2fc0a5d4d1743d6264d)) +* correct the `twitter:creator` of Twitter summary card ([96a16c8](https://github.com/cotes2020/jekyll-theme-chirpy/commit/96a16c868ede51e7dfa412de63ffa1e5a49add7f)) +* correctly URL encode share links ([4c1c8d8](https://github.com/cotes2020/jekyll-theme-chirpy/commit/4c1c8d8b0eacecbbaa2d522bbdd6430f350ff760)), 
closes [#496](https://github.com/cotes2020/jekyll-theme-chirpy/issues/496) +* follow paginate_path config for pagination ([6900d9f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/6900d9f2bc9380cbda4babf611c6eeff345291af)) +* force checkout of `gh-pages` branch ([#544](https://github.com/cotes2020/jekyll-theme-chirpy/issues/544)) ([5402523](https://github.com/cotes2020/jekyll-theme-chirpy/commit/5402523ae52a3740bcc15df0b226b2612644945d)) +* horizontal scroll for long equations ([#545](https://github.com/cotes2020/jekyll-theme-chirpy/issues/545)) ([30787fc](https://github.com/cotes2020/jekyll-theme-chirpy/commit/30787fc4cf151e955bb7afc26dfd859f1a06fce6)) +* p is not allowed in span ([4f590e2](https://github.com/cotes2020/jekyll-theme-chirpy/commit/4f590e2bba0639751771211bc0d357828ae70404)) +* remove whitespace from avatar URL ([#537](https://github.com/cotes2020/jekyll-theme-chirpy/issues/537)) ([0542b51](https://github.com/cotes2020/jekyll-theme-chirpy/commit/0542b5149c8287dca60e37f46ee36f31b43455e4)) +* resume the preview image SEO tag ([#529](https://github.com/cotes2020/jekyll-theme-chirpy/issues/529)) ([b8d1bcd](https://github.com/cotes2020/jekyll-theme-chirpy/commit/b8d1bcd3dea0abd1afef7ef154a4501fbb18938d)) +* script code should be in head or body, not in between ([2103191](https://github.com/cotes2020/jekyll-theme-chirpy/commit/2103191b2faf714a8e4418c7c347a1f942b51af8)) +* spurious header closing tags ([59e9557](https://github.com/cotes2020/jekyll-theme-chirpy/commit/59e955745f02f9b57c65af70b0979cd4a98bf53f)) +* table bypass refactoring when it contains IAL ([#519](https://github.com/cotes2020/jekyll-theme-chirpy/issues/519)) ([5d85ccb](https://github.com/cotes2020/jekyll-theme-chirpy/commit/5d85ccb9943aac88dbbefebe1c2234cdcbae5c53)) +* **theme mode:** `SCSS` syntax error ([#588](https://github.com/cotes2020/jekyll-theme-chirpy/issues/588)) ([76a1b6a](https://github.com/cotes2020/jekyll-theme-chirpy/commit/76a1b6a068c369138422dcd18ba08ec8cc3749a6)) +* use `jsonify` to generate valid json ([#521](https://github.com/cotes2020/jekyll-theme-chirpy/issues/521)) ([dd9d5a7](https://github.com/cotes2020/jekyll-theme-chirpy/commit/dd9d5a7207b746342d07176d8969dc4f2c380bf2)) +* when the `site.img_cdn` is set to the local path, the preview-image path loses the `baseurl` ([9cefe58](https://github.com/cotes2020/jekyll-theme-chirpy/commit/9cefe58993d9ea3a3a28424e7ffd8e0911567c5c)) + +### Improvements + +* avoid post pageviews from shifting while loading ([135a16f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/135a16f13ee783d9308669ff9a824847a73c951c)) +* avoid the layout shift for post datetime ([6d35f5f](https://github.com/cotes2020/jekyll-theme-chirpy/commit/6d35f5f8da044cfad071628bb53776de03efaae4)) +* **categories:** support singular and plural forms of locale ([#595](https://github.com/cotes2020/jekyll-theme-chirpy/issues/595)) ([35cadf9](https://github.com/cotes2020/jekyll-theme-chirpy/commit/35cadf969dd0161ee62503e242c545f006f7072b)) +* improve the responsive design for ultrawide screens ([#540](https://github.com/cotes2020/jekyll-theme-chirpy/issues/540)) ([5d6e8c5](https://github.com/cotes2020/jekyll-theme-chirpy/commit/5d6e8c5ef6aa71b4d2600c5305f6e8ba540557f7)) diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..7b5377f --- /dev/null +++ b/Gemfile @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gemspec + +group :test do + gem "html-proofer", "~> 3.18" +end + +# Windows and JRuby does not include zoneinfo files, so 
bundle the tzinfo-data gem +# and associated library. +platforms :mingw, :x64_mingw, :mswin, :jruby do + gem "tzinfo", ">= 1", "< 3" + gem "tzinfo-data" +end + +# Performance-booster for watching directories on Windows +gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] + +# Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem +# do not have a Java counterpart. +gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby] + +# Lock jekyll-sass-converter to 2.x on Linux-musl +if RUBY_PLATFORM =~ /linux-musl/ + gem "jekyll-sass-converter", "~> 2.0" +end diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..299d89f --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2019 Cotes Chung + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..04ad93a --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +![workflow status](https://github.com/shameekagarwal/shameekagarwal.github.io/actions/workflows/pages-deploy.yml/badge.svg) + +# Docker Command + +```shell +docker run -it --rm --volume="$PWD:/srv/jekyll" -p 4000:4000 jekyll/jekyll jekyll serve +``` diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..a688bcd --- /dev/null +++ b/_config.yml @@ -0,0 +1,212 @@ +# The Site Configuration + +# Import the theme +theme: jekyll-theme-chirpy + +# Change the following value to '/PROJECT_NAME' ONLY IF your site type is GitHub Pages Project sites +# and doesn't have a custom domain. +baseurl: "" + +# The language of the webpage › http://www.lingoes.net/en/translator/langcode.htm +# If it has the same name as one of the files in folder `_data/locales`, the layout language will also be changed, +# otherwise, the layout language will use the default value of 'en'. +lang: en + +# Change to your timezone › http://www.timezoneconverter.com/cgi-bin/findzone/findzone +timezone: Asia/Kolkata + +# jekyll-seo-tag settings › https://github.com/jekyll/jekyll-seo-tag/blob/master/docs/usage.md +# ↓ -------------------------- + +title: Shameek Agarwal # the main title + +tagline: A Software Engineer # it will display as the sub-title + +description: >- # used by seo meta and the atom feed + A minimal, responsive and feature-rich Jekyll theme for technical writing. 
+ +# fill in the protocol & hostname for your site, e.g., 'https://username.github.io' +url: "https://shameekagarwal.github.io" + +github: + username: shameekagarwal # change to your github username + +twitter: + username: twitter_username # change to your twitter username + +social: + # Change to your full name. + # It will be displayed as the default author of the posts and the copyright owner in the Footer + name: Shameek Agarwal + email: shameek.agarwal@gmail.com # change to your email address + links: + - 'https://www.linkedin.com/in/shameek-agarwal' # Fill with your Linkedin homepage + # The first element serves as the copyright owner's link + # - https://twitter.com/username # change to your twitter homepage + # - https://github.com/username # change to your github homepage + # Uncomment below to add more social links + # - https://www.facebook.com/username + # - https://www.linkedin.com/in/username + +google_site_verification: # fill in to your verification string + +# ↑ -------------------------- +# The end of `jekyll-seo-tag` settings + +google_analytics: + id: # fill in your Google Analytics ID + +# Prefer color scheme setting. +# +# Note: Keep empty will follow the system prefer color by default, +# and there will be a toggle to switch the theme between dark and light +# on the bottom left of the sidebar. +# +# Available options: +# +# light - Use the light color scheme +# dark - Use the dark color scheme +# +theme_mode: dark + +# The CDN endpoint for images. +# Notice that once it is assigned, the CDN url +# will be added to all image (site avatar & posts' images) paths starting with '/' +# +# e.g. 'https://cdn.com' +img_cdn: "" + +# the avatar on sidebar, support local or CORS resources +# avatar: "/assets/img/profile.jpg" + +# boolean type, the global switch for TOC in posts. +toc: true + +comments: + active: giscus # The global switch for posts comments, e.g., 'disqus'. Keep it empty means disable + # The active options are as follows: + disqus: + shortname: # fill with the Disqus shortname. 
› https://help.disqus.com/en/articles/1717111-what-s-a-shortname + # utterances settings › https://utteranc.es/ + utterances: + repo: shameekagarwal/shameekagarwal.github.io + issue_term: title + # Giscus options › https://giscus.app + giscus: + repo: shameekagarwal/shameekagarwal.github.io # / + repo_id: MDEwOlJlcG9zaXRvcnkzOTA3NzM5NzE= + category: Comments + category_id: DIC_kwDOF0q8084CX8ku + mapping: url # optional, default to 'pathname' + input_position: # optional, default to 'bottom' + lang: # optional, default to the value of `site.lang` + reactions_enabled: # optional, default to the value of `1` + +# + +# Self-hosted static assets, optional › https://github.com/cotes2020/chirpy-static-assets +assets: + self_host: + enabled: # boolean, keep empty means false + # specify the Jekyll environment, empty means both + # only works if `assets.self_host.enabled` is 'true' + env: # [development|production] + +pwa: + enabled: true # the option for PWA feature + +paginate: 10 + +# ------------ The following options are not recommended to be modified ------------------ + +kramdown: + syntax_highlighter: rouge + syntax_highlighter_opts: # Rouge Options › https://github.com/jneen/rouge#full-options + css_class: highlight + # default_lang: console + span: + line_numbers: false + block: + line_numbers: true + start_line: 1 + +collections: + tabs: + output: true + sort_by: order + +defaults: + - scope: + path: "_posts" # An empty string here means all files in the project + type: posts + values: + layout: post + comments: true # Enable comments in posts. + toc: true # Display TOC column in posts. + # DO NOT modify the following parameter unless you are confident enough + # to update the code of all other post links in this project. + permalink: /posts/:title/ + - scope: + path: _drafts + values: + comments: false + - scope: + path: "" + type: tabs # see `site.collections` + values: + layout: page + permalink: /:title/ + - scope: + path: assets/img/favicons + values: + swcache: true + - scope: + path: assets/js/dist + values: + swcache: true + +sass: + style: compressed + +compress_html: + clippings: all + comments: all + endings: all + profile: false + blanklines: false + ignore: + envs: [development] + +exclude: + - "*.gem" + - "*.gemspec" + - tools + - README.md + - CHANGELOG.md + - LICENSE + - rollup.config.js + - node_modules + - package*.json + +jekyll-archives: + enabled: [categories, tags] + layouts: + category: category + tag: tag + permalinks: + tag: /tags/:name/ + category: /categories/:name/ diff --git a/_data/authors.yml b/_data/authors.yml new file mode 100644 index 0000000..f012012 --- /dev/null +++ b/_data/authors.yml @@ -0,0 +1,17 @@ +## Template › https://github.com/jekyll/jekyll-seo-tag/blob/master/docs/advanced-usage.md#setting-author-url +# ------------------------------------- +# {author_id}: +# name: {full name} +# twitter: {twitter_of_author} +# url: {homepage_of_author} +# ------------------------------------- + +cotes: + name: Cotes Chung + twitter: cotes2020 + url: https://github.com/cotes2020/ + +sille_bille: + name: Dinesh Prasanth Moluguwan Krishnamoorthy + twitter: dinesh_MKD + url: https://github.com/SilleBille/ diff --git a/_data/contact.yml b/_data/contact.yml new file mode 100644 index 0000000..fbb9f13 --- /dev/null +++ b/_data/contact.yml @@ -0,0 +1,33 @@ +# The contact options. 
+
+- type: github
+  icon: "fab fa-github"
+
+- type: gitlab
+  icon: "fab fa-gitlab"
+  url: 'https://gitlab.com/shameekagarwal' # Fill with your GitLab homepage
+
+- type: email
+  icon: "fas fa-envelope"
+  noblank: true # open link in current tab
+
+#- type: rss
+#  icon: "fas fa-rss"
+#  noblank: true
+# Uncomment and complete the url below to enable more contact options
+#
+# - type: mastodon
+#   icon: 'fab fa-mastodon' # icons powered by
+#   url: '' # Fill with your Mastodon account page, rel="me" will be applied for verification
+#
+- type: linkedin
+  icon: 'fab fa-linkedin' # icons powered by
+  url: 'https://www.linkedin.com/in/shameek-agarwal' # Fill with your Linkedin homepage
+#
+# - type: stack-overflow
+#   icon: 'fab fa-stack-overflow'
+#   url: '' # Fill with your stackoverflow homepage
+
+- type: phone
+  icon: 'fas fa-phone'
+  url: 'tel:+916290885679'
diff --git a/_data/locales/ar.yml b/_data/locales/ar.yml
new file mode 100644
index 0000000..c608298
--- /dev/null
+++ b/_data/locales/ar.yml
@@ -0,0 +1,91 @@
+# The layout text of site
+
+# ----- Commons label -----
+
+layout:
+  post: منشور
+  category: فئة
+  tag: وسم
+
+# The tabs of sidebar
+tabs:
+  # format: :
+  home: الرئيسية
+  categories: الفئات
+  tags: الوسوم
+  archives: الأرشيف
+  about: حول
+
+# the text displayed in the search bar & search results
+search:
+  hint: بحث
+  cancel: إلغاء
+  no_results: نأسف! لا يوجد نتائج.
+
+panel:
+  lastmod: المحدثة مؤخرا
+  trending_tags: الوسوم الشائعة
+  toc: محتويات
+
+copyright:
+  # Shown at the bottom of the post
+  license:
+    template: هذا المنشور تحت ترخيص :LICENSE_NAME بواسطة المؤلف.
+    name: CC BY 4.0
+    link: https://creativecommons.org/licenses/by/4.0/
+
+  # Displayed in the footer
+  brief: بعض الحقوق محفوظة.
+  verbose: >-
+    ما لم يذكر خلاف ذلك ، يتم ترخيص منشورات المدونة على هذا الموقع
+    بموجب ترخيص Creative Commons Attribution 4.0 International (CC BY 4.0) من قبل المؤلف.
+
+meta: باستخدام :PLATFORM السمة :THEME
+
+not_found:
+  statment: عذرا, الرابط التالي غير صالح أو انه يشير إلى صفحة غير موجودة.
+
+notification:
+  update_found: يتوفر اصدار جديد للمحتوى.
+  update: تحديث
+
+# ----- Posts related labels -----
+
+post:
+  written_by: بواسطة
+  posted: نشّر
+  updated: حدّث
+  words: كلمات
+  pageview_measure: مشاهدات
+  read_time:
+    unit: دقيقة
+    prompt: قراءة
+  relate_posts: إقرأ المزيد
+  share: شارك
+  button:
+    next: الأجدد
+    previous: الأقدم
+    copy_code:
+      succeed: تم النسخ!
+    share_link:
+      title: أنسخ الرابط
+      succeed: تم نسخ الرابط بنجاح!
+
+# Date time format.
+# See: ,
+df:
+  post:
+    strftime: "%b %e, %Y"
+    dayjs: "ll"
+  archives:
+    strftime: "%b"
+    dayjs: "MMM"
+
+# categories page
+categories:
+  category_measure:
+    singular: فئة
+    plural: فئات
+  post_measure:
+    singular: منشور
+    plural: منشورات
diff --git a/_data/locales/bg-BG.yml b/_data/locales/bg-BG.yml
new file mode 100644
index 0000000..3e04993
--- /dev/null
+++ b/_data/locales/bg-BG.yml
@@ -0,0 +1,81 @@
+# The layout text of site
+
+# ----- Commons label -----
+
+layout:
+  post: Публикация
+  category: Категория
+  tag: Таг
+
+# The tabs of sidebar
+tabs:
+  # format: :
+  home: Начало
+  categories: Категории
+  tags: Тагове
+  archives: Архив
+  about: За мен
+
+# the text displayed in the search bar & search results
+search:
+  hint: търси
+  cancel: Отмени
+  no_results: Упс! Не са намерени резултати.
+
+panel:
+  lastmod: Наскоро обновени
+  trending_tags: Популярни тагове
+  toc: Съдържание
+
+copyright:
+  # Shown at the bottom of the post
+  license:
+    template: Тази публикация е лицензирана под :LICENSE_NAME от автора.
+ name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Някои права запазени. + verbose: >- + Освен ако не е посочено друго, публикациите в блога на този сайт са лицензирани + под лиценза Creative Commons Attribution 4.0 (CC BY 4.0) от автора. + +meta: Създадено чрез :PLATFORM и :THEME тема + +not_found: + statment: Съжалявам, но на този URL адрес няма налично съдържание. + +notification: + update_found: Налична е нова версия на съдържанието. + update: Обнови + +# ----- Posts related labels ----- + +post: + written_by: Автор + posted: Публикувана + updated: Обновена + words: думи + pageview_measure: преглеждания + read_time: + unit: мин + prompt: четиво + relate_posts: Още за четене + share: Споделете + button: + next: По-нови + previous: По-стари + copy_code: + succeed: Копирано! + share_link: + title: Копирай линк + succeed: Линкът е копиран успешно! + +# categories page +categories: + category_measure: + singular: категория + plural: категории + post_measure: + singular: публикация + plural: публикации diff --git a/_data/locales/cs-CZ.yml b/_data/locales/cs-CZ.yml new file mode 100644 index 0000000..e515c08 --- /dev/null +++ b/_data/locales/cs-CZ.yml @@ -0,0 +1,89 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Příspěvek + category: Kategorie + tag: Štítek + +# The tabs of sidebar +tabs: + # format: : + home: Domů + categories: Kategorie + tags: Štítky + archives: Archivy + about: O mně + +# the text displayed in the search bar & search results +search: + hint: hledat + cancel: Zrušit + no_results: Ups! Žádný výsledek nenalezen. + +panel: + lastmod: Nedávno aktualizováno + trending_tags: Trendy štítky + toc: Obsah + +copyright: + # Shown at the bottom of the post + license: + template: Tento příspěvek je licencován pod :LICENSE_NAME autorem. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Některá práva vyhrazena. + verbose: >- + Pokud není uvedeno jinak, jsou příspěvky na tomto webu licencovány + pod licencí Creative Commons Attribution 4.0 International (CC BY 4.0) Licence autora. + +meta: Použití :PLATFORM s motivem :THEME + +not_found: + statment: Omlouváme se, adresu URL jsme špatně umístili nebo odkazuje na něco, co neexistuje. + +notification: + update_found: Je k dispozici nová verze obsahu. + update: Aktualizace + +# ----- Posts related labels ----- + +post: + written_by: Od + posted: Zveřejněno + updated: Aktualizováno + words: slova + pageview_measure: zhlednutí + read_time: + unit: minut + prompt: čtení + relate_posts: Další čtení + share: Sdílet + button: + next: Novější + previous: Starší + copy_code: + succeed: Zkopírováno! + share_link: + title: Kopírovat odkaz + succeed: Zkopírováno! + +# Date time format. 
+# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: kategorie + post_measure: + singular: příspěvěk + plural: příspěvky diff --git a/_data/locales/de-DE.yml b/_data/locales/de-DE.yml new file mode 100644 index 0000000..7ea3956 --- /dev/null +++ b/_data/locales/de-DE.yml @@ -0,0 +1,80 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Eintrag + category: Kategorie + tag: Tag + +# The tabs of sidebar +tabs: + # format: : + home: Startseite + categories: Kategorien + tags: Tags + archives: Archiv + about: Über + +# the text displayed in the search bar & search results +search: + hint: Suche + cancel: Abbrechen + no_results: Ups! Keine Einträge gefunden. + +panel: + lastmod: Kürzlich aktualisiert + trending_tags: Beliebte Tags + toc: Inhalt + +copyright: + # Shown at the bottom of the post + license: + template: Dieser Eintrag ist vom Autor unter :LICENSE_NAME lizensiert. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Einige Rechte vorbehalten. + verbose: >- + Alle Einträge auf dieser Seite stehen, soweit nicht anders angegeben, unter der Lizenz Creative Commons Attribution 4.0 (CC BY 4.0). + +meta: Powered by :PLATFORM with :THEME theme + +not_found: + statment: Entschuldigung, dieser Link verweist auf keine vorhandene Ressource. + +notification: + update_found: Eine neue Version ist verfügbar. + update: Neue Version + +# ----- Posts related labels ----- + +post: + written_by: Von + posted: Veröffentlicht + updated: Aktualisiert + words: Wörter + pageview_measure: Aufrufe + read_time: + unit: Minuten + prompt: lesen + relate_posts: Weiterlesen + share: Teilen + button: + next: Nächster Eintrag + previous: Eintrag vorher + copy_code: + succeed: Kopiert! + share_link: + title: Link kopieren + succeed: Link erfolgreich kopiert! + +# categories page +categories: + category_measure: + singular: Kategorie + plural: Kategorien + post_measure: + singular: Eintrag + plural: Einträge diff --git a/_data/locales/el-GR.yml b/_data/locales/el-GR.yml new file mode 100644 index 0000000..ab5fb0e --- /dev/null +++ b/_data/locales/el-GR.yml @@ -0,0 +1,91 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Δημοσίευση + category: Κατηγορία + tag: Ετικέτα + +# The tabs of sidebar +tabs: + # format: : + home: Home + categories: Κατηγορίες + tags: Ετικέτες + archives: Αρχεία + about: Σχετικά + +# the text displayed in the search bar & search results +search: + hint: αναζήτηση + cancel: Ακύρωση + no_results: Oops! Κανένα αποτέλεσμα δεν βρέθηκε. + +panel: + lastmod: Σχετικά ενημερωμένα + trending_tags: Ετικέτες τάσης + toc: Περιεχόμενα + +copyright: + # Shown at the bottom of the post + license: + template: Η δημοσίευση αυτή βρίσκεται υπο την άδεια :LICENSE_NAME Greekforce1821. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Ορισμένα δικαιώματα reserved. + verbose: >- + Εκτός αλλού ή οπουδήποτε αλλού, τα blog posts σε αυτήν την σελίδα βρίσκονται υπο την άδεια + Creative Commons Attribution 4.0 International (CC BY 4.0) του δημιουργού. + +meta: Αξιοποιώντας την :PLATFORM theme :THEME + +not_found: + statment: Συγνώμη, έχουμε τοποθετήσει λάθος αυτήν την διεύθυνση URL ή υποδεικνύει κάτι που δεν υπάρχει. + +notification: + update_found: Υπάρχει διαθέσιμη μια νέα έκδοση του περιεχομένου. 
+ update: Ενημέρωση + +# ----- Posts related labels ----- + +post: + written_by: Από + posted: Δημοσιεύθηκε + updated: Ενημερώθηκε + words: λέξεις + pageview_measure: προβολές + read_time: + unit: Λεπτά + prompt: διαβάσματος + relate_posts: Περισσότερα + share: Κοινοποιήστε + button: + next: Νεότερα + previous: Παλαιότερα + copy_code: + succeed: Αντιγράφθηκε! + share_link: + title: Αντιγραφή συνδέσμου + succeed: Η διεύθυνση αντιγράφθηκε με επιτυχία! + +# Date time format. +# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: Κατηγορία + plural: Κατηγορίες + post_measure: + singular: Δημοσίευση + plural: Δημοσιεύσεις diff --git a/_data/locales/en.yml b/_data/locales/en.yml new file mode 100644 index 0000000..2f3f339 --- /dev/null +++ b/_data/locales/en.yml @@ -0,0 +1,91 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Post + category: Category + tag: Tag + +# The tabs of sidebar +tabs: + # format: : + home: Home + categories: Categories + tags: Tags + archives: Archives + about: About + +# the text displayed in the search bar & search results +search: + hint: search + cancel: Cancel + no_results: Oops! No results found. + +panel: + lastmod: Recently Updated + trending_tags: Trending Tags + toc: Contents + +copyright: + # Shown at the bottom of the post + license: + template: This post is licensed under :LICENSE_NAME by the author. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Some rights reserved. + verbose: >- + Except where otherwise noted, the blog posts on this site are licensed + under the Creative Commons Attribution 4.0 International (CC BY 4.0) License by the author. + +meta: Using the :PLATFORM theme :THEME + +not_found: + statment: Sorry, we've misplaced that URL or it's pointing to something that doesn't exist. + +notification: + update_found: A new version of content is available. + update: Update + +# ----- Posts related labels ----- + +post: + written_by: By + posted: Posted + updated: Updated + words: words + pageview_measure: views + read_time: + unit: min + prompt: read + relate_posts: Further Reading + share: Share + button: + next: Newer + previous: Older + copy_code: + succeed: Copied! + share_link: + title: Copy link + succeed: Link copied successfully! + +# Date time format. +# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: category + plural: categories + post_measure: + singular: post + plural: posts diff --git a/_data/locales/es-ES.yml b/_data/locales/es-ES.yml new file mode 100644 index 0000000..5529230 --- /dev/null +++ b/_data/locales/es-ES.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Entrada + category: Categoría + tag: Etiqueta + +# The tabs of sidebar +tabs: + # format: : + home: Inicio + categories: Categorías + tags: Etiquetas + archives: Archivo + about: Acerca de + +# the text displayed in the search bar & search results +search: + hint: Buscar + cancel: Cancelar + no_results: ¡Oops! No se encuentran resultados. + +panel: + lastmod: Actualizado recientemente + trending_tags: Etiquetas populares + toc: Contenido + +copyright: + # Shown at the bottom of the post + license: + template: Esta entrada está licenciada bajo :LICENSE_NAME por el autor. 
+ name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Algunos derechos reservados. + verbose: >- + Salvo que se indique explícitamente, las entradas de este blog están licenciadas + bajo la Creative Commons Attribution 4.0 International (CC BY 4.0) License por el autor. + +meta: Hecho con :PLATFORM usando el tema :THEME + +not_found: + statment: Lo sentimos, hemos perdido esa URL o apunta a algo que no existe. + +notification: + update_found: Hay una nueva versión de contenido disponible. + update: Actualizar + +# ----- Posts related labels ----- + +post: + written_by: Por + posted: Publicado + updated: Actualizado + words: palabras + pageview_measure: visitas + read_time: + unit: min + prompt: " de lectura" + relate_posts: Lecturas adicionales + share: Compartir + button: + next: Nuevo + previous: Anterior + copy_code: + succeed: ¡Copiado! + share_link: + title: Copiar enlace + succeed: ¡Enlace copiado! + +# categories page +categories: + category_measure: categorias + post_measure: entradas diff --git a/_data/locales/fi-FI.yml b/_data/locales/fi-FI.yml new file mode 100644 index 0000000..c817d2b --- /dev/null +++ b/_data/locales/fi-FI.yml @@ -0,0 +1,90 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Julkaisu + category: Kateogoria + tag: Tagi + +# The tabs of sidebar +tabs: + # format: : + home: Koti + categories: Kateogoriat + tags: Tagit + archives: Arkistot + about: Minusta + +# the text displayed in the search bar & search results +search: + hint: etsi + cancel: Peruuta + no_results: Hups! Ei tuloksia. + +panel: + lastmod: Viimeksi päivitetty + trending_tags: Trendaavat tagit + toc: Sisältö + +copyright: + # Shown at the bottom of the post + license: + template: Tämä julkaisu on lisenssoitu :LICENSE_NAME julkaisijan toimesta. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Jotkut oikeudet pidätetään. + verbose: >- + Paitsi jos erikseen mainitaan on kaikki sisältö Creative Commons Attribution 4.0 International (CC BY 4.0) Lisensoitu kirjoittajan toimesta. + +meta: Käytetään :PLATFORM iä Teema :THEME + +not_found: + statment: Valitettavasti tällä URL-osoitteella ei ole saatavilla sisältöä. + +notification: + update_found: Uusi versio sisällöstä on saatavilla. + update: Päivitä + +# ----- Posts related labels ----- + +post: + written_by: Kirjoittaja + posted: Julkaistu + updated: Päivitetty + words: sanaa + pageview_measure: katselukertoja + read_time: + unit: minuuttia + prompt: lukea + relate_posts: Jatka lukemista + share: Jaa + button: + next: Uudempi + previous: Vanhempi + copy_code: + succeed: Kopiotu! + share_link: + title: Kopioi linkki + succeed: Linkki kopioitu onnistuneesti! + +# Date time format. 
+# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: kategoria + plural: kategoriat + post_measure: + singular: julkaisu + plural: julkaisut diff --git a/_data/locales/fr-FR.yml b/_data/locales/fr-FR.yml new file mode 100644 index 0000000..72b034d --- /dev/null +++ b/_data/locales/fr-FR.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Post + category: Catégorie + tag: Tag + +# The tabs of sidebar +tabs: + # format: : + home: Accueil + categories: Catégories + tags: Tags + archives: Archives + about: A propos de + +# the text displayed in the search bar & search results +search: + hint: recherche + cancel: Annuler + no_results: Oups ! Aucun résultat trouvé. + +panel: + lastmod: Récemment mis à jour + trending_tags: Tags tendance + toc: Contenu + +copyright: + # Shown at the bottom of the post + license: + template: Cet article est sous licence :LICENSE_NAME par l'auteur. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Certains droits réservés. + verbose: >- + Sauf mention contraire, les articles de ce site sont publiés sous licence + sous la licence Creative Commons Attribution 4.0 International (CC BY 4.0) par l'auteur. + +meta: Propulsé par :PLATFORM avec le thème :THEME + +not_found: + statment: Désolé, nous avons égaré cette URL ou elle pointe vers quelque chose qui n'existe pas. + +notification: + update_found: Une nouvelle version du contenu est disponible. + update: Mise à jour + +# ----- Posts related labels ----- + +post: + written_by: Par + posted: Posté + updated: Mis à jour + words: mots + pageview_measure: vues + read_time: + unit: min + prompt: lire + relate_posts: Autres lectures + share: Partager + button: + next: Plus récent + previous: Plus ancien + copy_code: + succeed: Copié ! + share_link: + title: Copier le lien + succeed: Lien copié avec succès ! + +# categories page +categories: + category_measure: catégories + post_measure: posts diff --git a/_data/locales/hu-HU.yml b/_data/locales/hu-HU.yml new file mode 100644 index 0000000..b09f2cd --- /dev/null +++ b/_data/locales/hu-HU.yml @@ -0,0 +1,79 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Bejegyzés + category: Kategória + tag: Címke + +# The tabs of sidebar +tabs: + # format: : + home: Kezdőlap + categories: Kategóriák + tags: Címkék + archives: Archívum + about: Rólam + +# the text displayed in the search bar & search results +search: + hint: keresés + cancel: Mégse + no_results: Oops! Nincs találat a keresésre. + +panel: + lastmod: Legutóbb frissítve + trending_tags: Népszerű Címkék + toc: Tartalom + links: Blog linkek + +copyright: + # Shown at the bottom of the post + license: + template: A bejegyzés :LICENSE_NAME licenccel rendelkezik. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Néhány jog fenntartva. + verbose: >- + Az oldalon található tartalmak + Creative Commons Attribution 4.0 International (CC BY 4.0) licenccel rendelkeznek, + hacsak másképp nincs jelezve. + +meta: Készítve :PLATFORM motorral :THEME témával + +not_found: + statment: Sajnáljuk, az URL-t rosszul helyeztük el, vagy valami nem létezőre mutat. + +notification: + update_found: Elérhető a tartalom új verziója. 
+ update: Frissítés + +# ----- Posts related labels ----- + +post: + written_by: Szerző + posted: Létrehozva + updated: Frissítve + words: szó + pageview_measure: látogató + read_time: + unit: perc + prompt: elolvasni + relate_posts: További olvasnivaló + share: Megosztás + button: + next: Újabb + previous: Régebbi + copy_code: + succeed: Másolva! + share_link: + title: Link másolása + succeed: Link sikeresen másolva! + +# categories page +categories: + category_measure: kategória + post_measure: bejegyzés diff --git a/_data/locales/id-ID.yml b/_data/locales/id-ID.yml new file mode 100644 index 0000000..29ad156 --- /dev/null +++ b/_data/locales/id-ID.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Postingan + category: Kategori + tag: Tagar + +# The tabs of sidebar +tabs: + # format: : + home: Beranda + categories: Kategori + tags: Tagar + archives: Arsip + about: Tentang + +# the text displayed in the search bar & search results +search: + hint: Cari + cancel: Batal + no_results: Ups! Tidak ada hasil yang ditemukan. + +panel: + lastmod: Postingan Terbaru + trending_tags: Tagar Terpopuler + toc: Konten + +copyright: + # Shown at the bottom of the post + license: + template: Postingan ini dilisensikan di bawah :LICENSE_NAME oleh penulis. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Sebagian konten dilindungi. + verbose: >- + Kecuali jika dinyatakan, Postingan blog di situs ini dilisensikan + di bawah Lisensi Creative Commons Attribution 4.0 International (CC BY 4.0) oleh penulis. + +meta: Didukung oleh :PLATFORM dengan tema :THEME + +not_found: + statment: Maaf, kami gagal menemukan URL itu atau memang mengarah ke sesuatu yang tidak ada. + +notification: + update_found: Versi konten baru tersedia. + update: Perbarui + +# ----- Posts related labels ----- + +post: + written_by: Oleh + posted: Diterbitkan + updated: Diperbarui + words: kata + pageview_measure: dilihat + read_time: + unit: menit + prompt: baca + relate_posts: Postingan Lainya + share: Bagikan + button: + next: Terbaru + previous: Terlama + copy_code: + succeed: Disalin! + share_link: + title: Salin tautan + succeed: Tautan berhasil disalin! + +# categories page +categories: + category_measure: kategori + post_measure: Postingan diff --git a/_data/locales/it-IT.yml b/_data/locales/it-IT.yml new file mode 100644 index 0000000..cf7b691 --- /dev/null +++ b/_data/locales/it-IT.yml @@ -0,0 +1,90 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Post + category: Categoria + tag: Tag + +# The tabs of sidebar +tabs: + # format: : + home: Pagina principale + categories: Categorie + tags: Tags + archives: Archivio + about: Informazioni + +# the text displayed in the search bar & search results +search: + hint: ricerca + cancel: Cancella + no_results: Oops! La ricerca non ha fornito risultati. + +panel: + lastmod: Aggiornati recentemente + trending_tags: Tags più cliccati + toc: Contenuti + +copyright: + # Shown at the bottom of the post + license: + template: Questo post è sotto licenza :LICENSE_NAME a nome dell'autore. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Alcuni diritti riservati. + verbose: >- + Eccetto quando esplicitamente menzionato, i post di questo blog sono da ritenersi sotto + i termini di licenza Creative Commons Attribution 4.0 International (CC BY 4.0). 
+ +meta: Servizio offerto da :PLATFORM con tema :THEME +not_found: + statment: Ci scusiamo, non è stato possibile trovare l'URL in questione. Potrebbe puntare ad una pagina non esistente. + +notification: + update_found: Nuova versione del contenuto disponibile. + update: Aggiornamento + +# ----- Posts related labels ----- + +post: + written_by: Da + posted: Postato + updated: Aggiornato + words: parole + pageview_measure: visioni + read_time: + unit: min + prompt: lettura + relate_posts: Continua a leggere + share: Condividi + button: + next: Più recenti + previous: Meno recenti + copy_code: + succeed: Copiato! + share_link: + title: Copia link + succeed: Link copiato con successo! + +# Date time format. +# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: categoria + plural: categorie + post_measure: + singular: post + plural: posts diff --git a/_data/locales/ko-KR.yml b/_data/locales/ko-KR.yml new file mode 100644 index 0000000..4dd221b --- /dev/null +++ b/_data/locales/ko-KR.yml @@ -0,0 +1,84 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: 포스트 + category: 카테고리 + tag: 태그 + +# The tabs of sidebar +tabs: + # format: : + home: 홈 + categories: 카테고리 + tags: 태그 + archives: 아카이브 + about: 정보 + +# the text displayed in the search bar & search results +search: + hint: 검색 + cancel: 취소 + no_results: 검색 결과가 없습니다. + +panel: + lastmod: 최근 업데이트 + trending_tags: 인기 태그 + toc: 바로가기 + +copyright: + # Shown at the bottom of the post + license: + template: 이 기사는 저작권자의 :LICENSE_NAME 라이센스를 따릅니다. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: 일부 권리 보유 + verbose: >- + 명시되지 않는 한 이 사이트의 블로그 게시물은 작성자의 + Creative Commons Attribution 4.0 International(CC BY 4.0) 라이선스에 따라 사용이 허가되었습니다. + +meta: Powered by :PLATFORM with :THEME theme + +not_found: + statment: 해당 URL은 존재하지 않습니다. + +notification: + update_found: 새 버전의 콘텐츠를 사용할 수 있습니다. + update: 업데이트 + +# ----- Posts related labels ----- + +post: + written_by: By + posted: 게시 + updated: 업데이트 + words: 단어 + pageview_measure: 조회 + read_time: + unit: 분 + prompt: 읽는 시간 + relate_posts: 관련된 글 + share: 공유하기 + button: + next: 다음 글 + previous: 이전 글 + copy_code: + succeed: 복사되었습니다! + share_link: + title: 링크 복사하기 + succeed: 링크가 복사되었습니다! + +# Date time format. +# See: , +df: + post: + strftime: "%Y/%m/%d" + dayjs: "YYYY/MM/DD" + +# categories page +categories: + category_measure: 카테고리 + post_measure: 포스트 diff --git a/_data/locales/my-MM.yml b/_data/locales/my-MM.yml new file mode 100644 index 0000000..98848d5 --- /dev/null +++ b/_data/locales/my-MM.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: ပို့စ် + category: ကဏ္ဍ + tag: နာမ(တက်ဂ်) + +# The tabs of sidebar +tabs: + # format: : + home: အဓိကစာမျက်နှာ + categories: ကဏ္ဍများ + tags: နာမ(တက်ဂ်)များ + archives: မှတ်တမ်း​တိုက် + about: အကြောင်းအရာ + +# the text displayed in the search bar & search results +search: + hint: ရှာဖွေမည် + cancel: ဖျက်သိမ်းမည် + no_results: အိုး! 
ဘာမှမရှိပါ + +panel: + lastmod: မကြာသေးမီကမွမ်းမံထားသည် + trending_tags: ခေတ်စားနေသည့်တက်ဂ်များ + toc: အကြောင်းအရာများ + +copyright: + # Shown at the bottom of the post + license: + template: ဤပို့စ်သည်စာရေးသူ၏ :LICENSE_NAME လိုင်စင်ရထားသည်။ + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: မူပိုင်ခွင့်အချို့ကို လက်ဝယ်ထားသည်။ + verbose: >- + အခြားမှတ်သားထားချက်များမှလွဲ၍ ဤဆိုက်ရှိ ဘလော့ဂ်ပို့စ်များသည် စာရေးသူ၏ + Creative Commons Attribution 4.0 International (CC BY 4.0) အောက်တွင် လိုင်စင်ရထားပါသည်။ + +meta: Powered by :PLATFORM with :THEME theme + +not_found: + statment: ဝမ်းနည်းပါသည်၊ ကျွန်ုပ်တို့သည် အဆိုပါ URL ကို မှားယွင်းစွာ နေရာချထားခြင်း သို့မဟုတ် ၎င်းသည် မရှိသောအရာကို ညွှန်ပြနေပါသည်။ + +notification: + update_found: အကြောင်းအရာဗားရှင်းအသစ်ကို ရနိုင်ပါပြီ။ + update: အပ်ဒိတ် + +# ----- Posts related labels ----- + +post: + written_by: ကရေးသားခဲ့သည်။ + posted: တင်ထားခဲ့သည်။ + updated: မွမ်းမံထားခဲ့သည်။ + words: စကားလုံးများ + pageview_measure: အမြင်များ + read_time: + unit: မိနစ် + prompt: ဖတ်ပါမည် + relate_posts: နောက်ထပ်ဖတ်ရန် + share: မျှဝေရန် + button: + next: အသစ်များ + previous: အဟောင်းများ + copy_code: + succeed: ကူးယူလိုက်ပြီ။ + share_link: + title: လင့်ခ်ကို ကူးယူရန် + succeed: လင့်ခ်ကို ကူးယူလိုက်ပြီ။ + +# categories page +categories: + category_measure: ကဏ္ဍများ + post_measure: ပို့စ်များ diff --git a/_data/locales/pt-BR.yml b/_data/locales/pt-BR.yml new file mode 100644 index 0000000..4cef833 --- /dev/null +++ b/_data/locales/pt-BR.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Post + category: Categoria + tag: Tag + +# The tabs of sidebar +tabs: + # format: : + home: Home + categories: Categorias + tags: Tags + archives: Arquivos + about: Sobre + +# the text displayed in the search bar & search results +search: + hint: Buscar + cancel: Cancelar + no_results: Oops! Nenhum resultado encontrado. + +panel: + lastmod: Atualizados recentemente + trending_tags: Trending Tags + toc: Conteúdo + +copyright: + # Shown at the bottom of the post + license: + template: Esta postagem está licenciada sob :LICENSE_NAME pelo autor. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Alguns direitos reservados. + verbose: >- + Exceto onde indicado de outra forma, as postagens do blog neste site são licenciadas sob a + Creative Commons Attribution 4.0 International (CC BY 4.0) License pelo autor. + +meta: Feito com :PLATFORM usando o tema :THEME + +not_found: + statment: Desculpe, a página não foi encontrada. + +notification: + update_found: Uma nova versão do conteúdo está disponível. + update: atualização + +# ----- Posts related labels ----- + +post: + written_by: Por + posted: Postado em + updated: Atualizado + words: palavras + pageview_measure: visualizações + read_time: + unit: min + prompt: " de leitura" + relate_posts: Leia também + share: Compartilhar + button: + next: Próximo + previous: Anterior + copy_code: + succeed: Copiado! + share_link: + title: Copie o link + succeed: Link copiado com sucesso! 
+ +# categories page +categories: + category_measure: categorias + post_measure: posts diff --git a/_data/locales/ru-RU.yml b/_data/locales/ru-RU.yml new file mode 100644 index 0000000..4377300 --- /dev/null +++ b/_data/locales/ru-RU.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Публикация + category: Категория + tag: Тег + +# The tabs of sidebar +tabs: + # format: : + home: Домашняя страница + categories: Категории + tags: Теги + archives: Архив + about: О сайте + +# the text displayed in the search bar & search results +search: + hint: поиск + cancel: Отменить + no_results: Ох! Ничего не найдено. + +panel: + lastmod: Недавно обновлено + trending_tags: Популярные теги + toc: Содержание + +copyright: + # Shown at the bottom of the post + license: + template: Публикация защищена лицензией :LICENSE_NAME. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Некоторые права защищены. + verbose: >- + Публикации на сайте защищены лицензией Creative Commons Attribution 4.0 International (CC BY 4.0), + если в тексте публикации не указано иное. + +meta: Powered by :PLATFORM with :THEME theme + +not_found: + statment: Извините, эта ссылка указывает на ресурс который не существует. + +notification: + update_found: Доступна новая версия контента. + update: Обновлять + +# ----- Posts related labels ----- + +post: + written_by: Автор + posted: Время публикации + updated: Обновлено + words: слов + pageview_measure: просмотров + read_time: + unit: минут + prompt: чтения + relate_posts: Вам также может быть интересно + share: Поделиться + button: + next: Предыдущая публикация + previous: Следующая публикация + copy_code: + succeed: Скопировано успешно! + share_link: + title: Скопировать ссылку + succeed: Ссылка успешно скопирована! + +# categories page +categories: + category_measure: категории + post_measure: публикации diff --git a/_data/locales/sl-SI.yml b/_data/locales/sl-SI.yml new file mode 100644 index 0000000..7ab18b1 --- /dev/null +++ b/_data/locales/sl-SI.yml @@ -0,0 +1,91 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Objava #Post + category: Kategorija #Category + tag: Oznaka #Tag + +# The tabs of sidebar +tabs: + # format: : + home: Domov #Home + categories: Kategorije #Categories + tags: Oznake #Tags + archives: Arhiv #Archives + about: O meni #About + +# the text displayed in the search bar & search results +search: + hint: išči #search + cancel: Prekliči #Cancel + no_results: Ups! Vsebina ni bila najdena #Oops! No results found. + +panel: + lastmod: Nedavno Posodobljeno #Recently Updated + trending_tags: Priljubljene Oznake #Trending Tags + toc: Vsebina #Contents + +copyright: + # Shown at the bottom of the post + license: + template: Ta objava je licencirana pod :LICENCE_NAME s strani avtorja. #This post is licensed under :LICENSE_NAME by the author. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Nekatere pravice pridržane. #Some rights reserved. + verbose: >- + Razen kjer navedeno drugače, vse objave spletnega dnevnika so licencirane + pod Creative Commons Attribution 4.0 International (CC BY 4.0) s strani avtorja. + +meta: Uporabljena :PLATFORM tema :THEME #Using the :PLATFORM theme :THEME + +not_found: + statment: Oprostite, hiperpovezava je neustrezna ali vsebina ne obstajata. #Sorry, we've misplaced that URL or it's pointing to something that doesn't exist. 
+ +notification: + update_found: Novejša različica vsebine je na voljo. #A new version of content is available. + update: Posodobi #Update + +# ----- Posts related labels ----- + +post: + written_by: Od #By + posted: Objavljeno #Posted + updated: Posodobljeno #Updated + words: besede #words + pageview_measure: ogledi #views + read_time: + unit: min + prompt: beri #read + relate_posts: Nadaljnje branje #Further Reading + share: Deli #Share + button: + next: Novejše #Newer + previous: Starejše #Older + copy_code: + succeed: Kopirano! #Copied! + share_link: + title: Kopiraj povezavo #Copy link + succeed: Povezava uspešno kopirana! #Link copied successfully! + +# Date time format. +# See: , +df: + post: + strftime: "%e %b, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: kategorija #category + plural: kategorije #categories + post_measure: + singular: objava #post + plural: objave #posts diff --git a/_data/locales/sv-SE.yml b/_data/locales/sv-SE.yml new file mode 100644 index 0000000..7ec2ee2 --- /dev/null +++ b/_data/locales/sv-SE.yml @@ -0,0 +1,91 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Inlägg #Post + category: Kategori #Category + tag: Tagga #Tag + +# The tabs of sidebar +tabs: + # format: : + home: Hem #Home + categories: Kategorier #Categories + tags: Taggar #Tags + archives: Arkiv #Archives + about: Om #About + +# the text displayed in the search bar & search results +search: + hint: sök + cancel: Avbryt + no_results: Hoppsan! Hittade inga sökträffar. + +panel: + lastmod: Senast uppdaterad + trending_tags: Trendande taggar + toc: Innehåll + +copyright: + # Shown at the bottom of the post + license: + template: Den här posten är publicerad under licensen :LICENSE_NAME av författaren. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Vissa rättigheter är reserverade. + verbose: >- + Om inte annat anges är blogginläggen på denna webbplats licensierade + under Creative Commons Attribution 4.0 International (CC BY 4.0) av författaren. + +meta: Byggd med :PLATFORM och temat :THEME + +not_found: + statment: Ursäkta, vi har tappat bort den här webbadressen eller så pekar den på något som inte längre finns. + +notification: + update_found: Det finns en ny version av innehållet. + update: Uppdatera sidan + +# ----- Posts related labels ----- + +post: + written_by: Av + posted: Postad + updated: Uppdaterad + words: ord + pageview_measure: visningar + read_time: + unit: min + prompt: läsning + relate_posts: Mer läsning + share: Dela + button: + next: Nyare + previous: Äldre + copy_code: + succeed: Kopierat! + share_link: + title: Kopiera länk + succeed: Länken har kopierats! + +# Date time format. 
+# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: kategori + plural: kategorier + post_measure: + singular: inlägg + plural: inlägg diff --git a/_data/locales/th.yml b/_data/locales/th.yml new file mode 100644 index 0000000..22cb00a --- /dev/null +++ b/_data/locales/th.yml @@ -0,0 +1,91 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: โพสต์ + category: หมวดหมู่ + tag: แท็ก + +# The tabs of sidebar +tabs: + # format: : + home: หน้าแรก + categories: หมวดหมู่ + tags: แท็ก + archives: คลังเก็บ + about: เกี่ยวกับ + +# the text displayed in the search bar & search results +search: + hint: ค้นหา + cancel: ยกเลิก + no_results: โอ๊ะ! ไม่พบผลลัพธ์ + +panel: + lastmod: อัปเดตล่าสุด + trending_tags: แท็กยอดนิยม + toc: เนื้อหา + +copyright: + # Shown at the bottom of the post + license: + template: โพสต์นี้อยู่ภายใต้การอนุญาต :LICENSE_NAME โดยผู้เขียน + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: สงวนลิขสิทธิ์เป็นบางส่วน + verbose: >- + เว้นแต่ว่าจะระบุเป็นอย่างอื่น โพสต์บนเว็บไซต์นี้อยู่ภายใต้ + สัญญาอนุญาตครีเอทีฟคอมมอนส์แบบ 4.0 นานาชาติ (CC BY 4.0) โดยผู้เขียน + +meta: กำลังใช้ธีมของ :PLATFORM ชื่อ :THEME + +not_found: + statment: ขออภัย เราวาง URL นั้นไว้ผิดที่ หรือมันชี้ไปยังสิ่งที่ไม่มีอยู่ + +notification: + update_found: มีเวอร์ชันใหม่ของเนื้อหา + update: อัปเดต + +# ----- Posts related labels ----- + +post: + written_by: โดย + posted: โพสต์เมื่อ + updated: อัปเดตเมื่อ + words: คำ + pageview_measure: ครั้ง + read_time: + unit: นาที + prompt: อ่าน + relate_posts: อ่านต่อ + share: แชร์ + button: + next: ใหม่กว่า + previous: เก่ากว่า + copy_code: + succeed: คัดลอกแล้ว! + share_link: + title: คัดลอกลิงก์ + succeed: คัดลอกลิงก์เรียบร้อยแล้ว! + +# Date time format. +# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: หมวดหมู่ + plural: หมวดหมู่ + post_measure: + singular: โพสต์ + plural: โพสต์ diff --git a/_data/locales/tr-TR.yml b/_data/locales/tr-TR.yml new file mode 100644 index 0000000..851f5fc --- /dev/null +++ b/_data/locales/tr-TR.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Gönderi + category: Kategori + tag: Etiket + +# The tabs of sidebar +tabs: + # format: : + home: Ana Sayfa + categories: Kategoriler + tags: Etiketler + archives: Arşiv + about: Hakkında + +# the text displayed in the search bar & search results +search: + hint: Ara... + cancel: İptal + no_results: Hop! Öyle bir şey bulamadım. + +panel: + lastmod: Yeni Güncellendi + trending_tags: Yükselen Etiketler + toc: İçindekiler + +copyright: + # Shown at the bottom of the post + license: + template: Bu gönderi :LICENSE_NAME lisansı altındadır. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/deed.tr + + # Displayed in the footer + brief: Bazı hakları saklıdır. + verbose: >- + Aksi belirtilmediği sürece, bu sitedeki gönderiler Creative Commons Atıf 4.0 Uluslararası (CC BY 4.0) Lisansı altındadır. + Kısaca sayfa linkini de vererek paylaşabilir veya düzenleyip paylaşabilirsin. + +meta: :PLATFORM ve :THEME teması + +not_found: + statment: Üzgünüz, bu linki yanlış yerleştirdik veya var olmayan bir şeye işaret ediyor. + +notification: + update_found: İçeriğin yeni bir sürümü mevcut. 
+ update: Güncelle + +# ----- Posts related labels ----- + +post: + written_by: Yazan + posted: Gönderilme Tarihi + updated: Güncellenme Tarihi + words: sözcük + pageview_measure: görüntülenme + read_time: + unit: dakikada + prompt: okunabilir + relate_posts: Benzer Gönderiler + share: Paylaş + button: + next: İleri + previous: Geri + copy_code: + succeed: Kopyalandı. + share_link: + title: Linki kopyala + succeed: Link kopyalandı. + +# categories page +categories: + category_measure: kategori + post_measure: gönderi diff --git a/_data/locales/uk-UA.yml b/_data/locales/uk-UA.yml new file mode 100644 index 0000000..b605073 --- /dev/null +++ b/_data/locales/uk-UA.yml @@ -0,0 +1,77 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Публікація + category: Категорія + tag: Тег + +# The tabs of sidebar +tabs: + # format: : + home: Домашня сторінка + categories: Категорії + tags: Теги + archives: Архів + about: Про сайт + +# the text displayed in the search bar & search results +search: + hint: пошук + cancel: Скасувати + no_results: Ох! Нічого не знайдено. + +panel: + lastmod: Нещодавно оновлено + trending_tags: Популярні теги + toc: Зміст + +copyright: + # Shown at the bottom of the post + license: + template: Публікація захищена ліцензією :LICENSE_NAME. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Деякі права захищено. + verbose: >- + Публікації на сайті захищено ліцензією Creative Commons Attribution 4.0 International (CC BY 4.0), + якщо інше не вказано в тексті. + +meta: Powered by :PLATFORM with :THEME theme + +not_found: + statment: Вибачте, це посилання вказує на ресурс, що не існує. + +notification: + update_found: Доступна нова версія вмісту. + update: Оновлення + +# ----- Posts related labels ----- + +post: + written_by: Автор + posted: Час публікації + updated: Оновлено + words: слів + pageview_measure: переглядів + read_time: + unit: хвилин + prompt: читання + relate_posts: Вас також може зацікавити + share: Поділитися + button: + next: Попередня публікація + previous: Наступна публікація + copy_code: + succeed: Успішно скопійовано! + share_link: + title: Скопіювати посилання + succeed: Посилання успішно скопійовано! + +# categories page +categories: + category_measure: категорії + post_measure: публікації diff --git a/_data/locales/vi-VN.yml b/_data/locales/vi-VN.yml new file mode 100644 index 0000000..617431a --- /dev/null +++ b/_data/locales/vi-VN.yml @@ -0,0 +1,76 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Bài viết + category: Danh mục + tag: Thẻ + +# The tabs of sidebar +tabs: + # format: : + home: Trang chủ + categories: Các danh mục + tags: Các thẻ + archives: Lưu trữ + about: Giới thiệu + +# the text displayed in the search bar & search results +search: + hint: tìm kiếm + cancel: Hủy + no_results: Không có kết quả tìm kiếm. + +panel: + lastmod: Mới cập nhật + trending_tags: Các thẻ thịnh hành + toc: Mục lục + +copyright: + # Shown at the bottom of the post + license: + template: Bài viết này được cấp phép bởi tác giả theo giấy phép :LICENSE_NAME. + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: Một số quyền được bảo lưu. + verbose: >- + Trừ khi có ghi chú khác, các bài viết đăng trên trang này được cấp phép bởi tác giả theo giấy phép Creative Commons Attribution 4.0 International (CC BY 4.0). 
+ +meta: Trang web này được tạo bởi :PLATFORM với chủ đề :THEME + +not_found: + statment: Xin lỗi, chúng tôi đã đặt nhầm URL hoặc đường dẫn trỏ đến một trang nào đó không tồn tại. + +notification: + update_found: Đã có phiên bản mới của nội dung. + update: Cập nhật + +# ----- Posts related labels ----- + +post: + written_by: Viết bởi + posted: Đăng lúc + updated: Cập nhật lúc + words: từ + pageview_measure: lượt xem + read_time: + unit: phút + prompt: đọc + relate_posts: Bài viết liên quan + share: Chia sẻ + button: + next: Mới hơn + previous: Cũ hơn + copy_code: + succeed: Đã sao chép! + share_link: + title: Sao chép đường dẫn + succeed: Đã sao chép đường dẫn thành công! + +# categories page +categories: + category_measure: danh mục + post_measure: bài viết diff --git a/_data/locales/zh-CN.yml b/_data/locales/zh-CN.yml new file mode 100644 index 0000000..f828134 --- /dev/null +++ b/_data/locales/zh-CN.yml @@ -0,0 +1,83 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: 文章 + category: 分类 + tag: 标签 + +# The tabs of sidebar +tabs: + # format: : + home: 首页 + categories: 分类 + tags: 标签 + archives: 归档 + about: 关于 + +# the text displayed in the search bar & search results +search: + hint: 搜索 + cancel: 取消 + no_results: 搜索结果为空 + +panel: + lastmod: 最近更新 + trending_tags: 热门标签 + toc: 文章内容 + +copyright: + # Shown at the bottom of the post + license: + template: 本文由作者按照 :LICENSE_NAME 进行授权 + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: 保留部分权利。 + verbose: >- + 除非另有说明,本网站上的博客文章均由作者按照知识共享署名 4.0 国际 (CC BY 4.0) 许可协议进行授权。 + +meta: 本站采用 :PLATFORM 主题 :THEME + +not_found: + statment: 抱歉,我们放错了该 URL,或者它指向了不存在的内容。 + +notification: + update_found: 发现新版本的内容。 + update: 更新 + +# ----- Posts related labels ----- + +post: + written_by: 作者 + posted: 发表于 + updated: 更新于 + words: 字 + pageview_measure: 次浏览 + read_time: + unit: 分钟 + prompt: 阅读 + relate_posts: 相关文章 + share: 分享 + button: + next: 下一篇 + previous: 上一篇 + copy_code: + succeed: 已复制! + share_link: + title: 分享链接 + succeed: 链接已复制! + +# Date time format. +# See: , +df: + post: + strftime: "%Y/%m/%d" + dayjs: "YYYY/MM/DD" + +# categories page +categories: + category_measure: 个分类 + post_measure: 篇文章 diff --git a/_data/locales/zh-TW.yml b/_data/locales/zh-TW.yml new file mode 100644 index 0000000..911253b --- /dev/null +++ b/_data/locales/zh-TW.yml @@ -0,0 +1,83 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: 文章 + category: 分類 + tag: 標籤 + +# The tabs of sidebar +tabs: + # format: : + home: 首頁 + categories: 分類 + tags: 標籤 + archives: 封存 + about: 關於 + +# the text displayed in the search bar & search results +search: + hint: 搜尋 + cancel: 取消 + no_results: 沒有搜尋結果 + +panel: + lastmod: 最近更新 + trending_tags: 熱門標籤 + toc: 文章摘要 + +copyright: + # Shown at the bottom of the post + license: + template: 本文章以 :LICENSE_NAME 授權 + name: CC BY 4.0 + link: https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: 保留部份權利。 + verbose: >- + 除非另有說明,否則本網誌的文章均由作者按照姓名標示 4.0 國際 (CC BY 4.0) 授權條款進行授權。 + +meta: 本網站使用 :PLATFORM 產生,採用 :THEME 主題 + +not_found: + statment: 抱歉,您可能正在存取一個已被移動的 URL,或者它從未存在。 + +notification: + update_found: 發現新版本更新。 + update: 更新 + +# ----- Posts related labels ----- + +post: + written_by: 作者 + posted: 發布於 + updated: 更新於 + words: 字 + pageview_measure: 次瀏覽 + read_time: + unit: 分鐘 + prompt: 閱讀 + relate_posts: 相關文章 + share: 分享 + button: + next: 下一篇 + previous: 上一篇 + copy_code: + succeed: 已複製! + share_link: + title: 分享連結 + succeed: 已複製連結! 
+ +# Date time format. +# See: , +df: + post: + strftime: "%Y/%m/%d" + dayjs: "YYYY/MM/DD" + +# categories page +categories: + category_measure: 個分類 + post_measure: 篇文章 diff --git a/_data/origin/basic.yml b/_data/origin/basic.yml new file mode 100644 index 0000000..14d865a --- /dev/null +++ b/_data/origin/basic.yml @@ -0,0 +1,48 @@ +# fonts + +webfonts: /assets/lib/fonts/main.css + +# Libraries + +jquery: + js: /assets/lib/jquery/jquery.min.js + +bootstrap: + css: /assets/lib/bootstrap/bootstrap.min.css + js: /assets/lib/bootstrap/bootstrap.bundle.min.js + +toc: + css: /assets/lib/tocbot/tocbot.min.css + js: /assets/lib/tocbot/tocbot.min.js + +fontawesome: + css: /assets/lib/fontawesome-free/css/all.min.css + +search: + js: /assets/lib/simple-jekyll-search/simple-jekyll-search.min.js + +mermaid: + js: /assets/lib/mermaid/mermaid.min.js + +dayjs: + js: + common: /assets/lib/dayjs/dayjs.min.js + locale: /assets/lib/dayjs/locale/en.min.js + relativeTime: /assets/lib/dayjs/plugin/relativeTime.min.js + localizedFormat: /assets/lib/dayjs/plugin/localizedFormat.min.js + +magnific-popup: + css: /assets/lib/magnific-popup/magnific-popup.css + js: /assets/lib/magnific-popup/jquery.magnific-popup.min.js + +lazysizes: + js: /assets/lib/lazysizes/lazysizes.min.js + +clipboard: + js: /assets/lib/clipboard/clipboard.min.js + +polyfill: + js: /assets/lib/polyfill-v3-es6/polyfill.min.js + +mathjax: + js: /assets/lib/mathjax/tex-chtml.js diff --git a/_data/origin/cors.yml b/_data/origin/cors.yml new file mode 100644 index 0000000..2d28bba --- /dev/null +++ b/_data/origin/cors.yml @@ -0,0 +1,59 @@ +# CDNs + +cdns: + # Google Fonts + - url: https://fonts.googleapis.com + - url: https://fonts.gstatic.com + args: crossorigin + - url: https://fonts.googleapis.com + # jsDelivr CDN + - url: https://cdn.jsdelivr.net + +# fonts + +webfonts: https://fonts.googleapis.com/css2?family=Lato&family=Source+Sans+Pro:wght@400;600;700;900&display=swap + +# Libraries + +jquery: + js: https://cdn.jsdelivr.net/npm/jquery@3.7.0/dist/jquery.min.js + +bootstrap: + css: https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/css/bootstrap.min.css + js: https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/js/bootstrap.bundle.min.js + +toc: + css: https://cdn.jsdelivr.net/npm/tocbot@4.21.0/dist/tocbot.min.css + js: https://cdn.jsdelivr.net/npm/tocbot@4.21.0/dist/tocbot.min.js + +fontawesome: + css: https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@6.4.0/css/all.min.css + +search: + js: https://cdn.jsdelivr.net/npm/simple-jekyll-search@1.10.0/dest/simple-jekyll-search.min.js + +mermaid: + js: https://cdn.jsdelivr.net/npm/mermaid@9.4.3/dist/mermaid.min.js + +dayjs: + js: + common: https://cdn.jsdelivr.net/npm/dayjs@1.11.7/dayjs.min.js + locale: https://cdn.jsdelivr.net/npm/dayjs@1.11.7/locale/:LOCALE.min.js + relativeTime: https://cdn.jsdelivr.net/npm/dayjs@1.11.7/plugin/relativeTime.min.js + localizedFormat: https://cdn.jsdelivr.net/npm/dayjs@1.11.7/plugin/localizedFormat.min.js + +magnific-popup: + css: https://cdn.jsdelivr.net/npm/magnific-popup@1.1.0/dist/magnific-popup.min.css + js: https://cdn.jsdelivr.net/npm/magnific-popup@1.1.0/dist/jquery.magnific-popup.min.js + +lazysizes: + js: https://cdn.jsdelivr.net/npm/lazysizes@5.3.2/lazysizes.min.js + +clipboard: + js: https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js + +polyfill: + js: https://polyfill.io/v3/polyfill.min.js?features=es6 + +mathjax: + js: https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/tex-chtml.js diff --git a/_data/share.yml b/_data/share.yml new 
file mode 100644 index 0000000..c1d4d63 --- /dev/null +++ b/_data/share.yml @@ -0,0 +1,25 @@ +# Sharing options at the bottom of the post. +# Icons from + +platforms: + - type: Twitter + icon: "fab fa-twitter" + link: "https://twitter.com/intent/tweet?text=TITLE&url=URL" + + - type: Facebook + icon: "fab fa-facebook-square" + link: "https://www.facebook.com/sharer/sharer.php?title=TITLE&u=URL" + + - type: Telegram + icon: "fab fa-telegram" + link: "https://t.me/share/url?url=URL&text=TITLE" + + # Uncomment below if you need to. + # + # - type: Linkedin + # icon: "fab fa-linkedin" + # link: "https://www.linkedin.com/sharing/share-offsite/?url=URL" + # + # - type: Weibo + # icon: "fab fa-weibo" + # link: "http://service.weibo.com/share/share.php?title=TITLE&url=URL" diff --git a/_includes/comments.html b/_includes/comments.html new file mode 100644 index 0000000..39e521f --- /dev/null +++ b/_includes/comments.html @@ -0,0 +1,5 @@ + +{% if page.comments and site.comments.active %} + {% capture path %}comments/{{ site.comments.active }}.html{% endcapture %} + {% include {{ path }} %} +{% endif %} diff --git a/_includes/comments/disqus.html b/_includes/comments/disqus.html new file mode 100644 index 0000000..d2f59df --- /dev/null +++ b/_includes/comments/disqus.html @@ -0,0 +1,49 @@ + +
+

Comments powered by Disqus.

+
+ + diff --git a/_includes/comments/giscus.html b/_includes/comments/giscus.html new file mode 100644 index 0000000..ed918a9 --- /dev/null +++ b/_includes/comments/giscus.html @@ -0,0 +1,64 @@ + + diff --git a/_includes/comments/utterances.html b/_includes/comments/utterances.html new file mode 100644 index 0000000..afd7cd3 --- /dev/null +++ b/_includes/comments/utterances.html @@ -0,0 +1,51 @@ + + + + diff --git a/_includes/datetime.html b/_includes/datetime.html new file mode 100644 index 0000000..53258ba --- /dev/null +++ b/_includes/datetime.html @@ -0,0 +1,19 @@ + + +{% assign wrap_elem = include.wrap | default: 'em' %} +{% assign df_strftime = site.data.locales[include.lang].df.post.strftime | default: '%d/%m/%Y' %} +{% assign df_dayjs = site.data.locales[include.lang].df.post.dayjs | default: 'DD/MM/YYYY' %} + +<{{ wrap_elem }} + class="{% if include.class %}{{ include.class }}{% endif %}" + data-ts="{{ include.date | date: '%s' }}" + data-df="{{ df_dayjs }}" + {% if include.tooltip %} + data-bs-toggle="tooltip" data-bs-placement="bottom" + {% endif %} +> + {{ include.date | date: df_strftime }} + diff --git a/_includes/embed/twitch.html b/_includes/embed/twitch.html new file mode 100644 index 0000000..ab0419a --- /dev/null +++ b/_includes/embed/twitch.html @@ -0,0 +1,4 @@ + diff --git a/_includes/embed/youtube.html b/_includes/embed/youtube.html new file mode 100644 index 0000000..715063c --- /dev/null +++ b/_includes/embed/youtube.html @@ -0,0 +1,6 @@ + diff --git a/_includes/favicons.html b/_includes/favicons.html new file mode 100644 index 0000000..201f6d8 --- /dev/null +++ b/_includes/favicons.html @@ -0,0 +1,17 @@ + + +{% capture favicon_path %}{{ '/assets/img/favicons' | relative_url }}{% endcapture %} + + + + + + + + + + + diff --git a/_includes/footer.html b/_includes/footer.html new file mode 100644 index 0000000..3b36c4a --- /dev/null +++ b/_includes/footer.html @@ -0,0 +1,34 @@ + + +
+
+
+

+ {%- capture _platform -%} + Jekyll + {%- endcapture -%} + + {%- capture _theme -%} + Chirpy + {%- endcapture -%} + + {{ site.data.locales[include.lang].meta | replace: ':PLATFORM', _platform | replace: ':THEME', _theme }} +

+ +

+ {{- '©' }} + {{ 'now' | date: '%Y' }} + {{ site.social.name }}. + {% if site.data.locales[include.lang].copyright.brief %} + + {{- site.data.locales[include.lang].copyright.brief -}} + + {% endif %} +

+
+
+
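
The footer template above fills the locale's meta string by swapping the :PLATFORM and :THEME placeholders for "Jekyll" and "Chirpy", and prints a copyright line from site.social.name. A minimal _config.yml sketch for the value it reads; only the key path comes from the Liquid above, the name is a placeholder:

  # _config.yml (sketch; the name is a placeholder, the key path is read by footer.html)
  social:
    name: Jane Doe   # rendered in the footer copyright line
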
diff --git a/_includes/google-analytics.html b/_includes/google-analytics.html new file mode 100644 index 0000000..e5e5119 --- /dev/null +++ b/_includes/google-analytics.html @@ -0,0 +1,14 @@ + + + + diff --git a/_includes/head.html b/_includes/head.html new file mode 100644 index 0000000..e4bfcb6 --- /dev/null +++ b/_includes/head.html @@ -0,0 +1,95 @@ + + + + + + + + + + + {% capture seo_tags %} + {% seo title=false %} + {% endcapture %} + + {% if page.image %} + {% assign img = page.image.path | default: page.image %} + + {% unless img contains '://' %} + {% assign img_path = page.img_path | append: '/' | append: img | replace: '//', '/' %} + {% capture target %}"{{ img | absolute_url }}"{% endcapture %} + + {% if site.img_cdn contains '//' %} + + {% capture replacement %}"{{ site.img_cdn }}{{ img_path }}"{% endcapture %} + {% else %} + + {%- capture replacement -%} + "{{ site.img_cdn | append: '/' | append: img_path | replace: '//', '/' | absolute_url }}" + {%- endcapture -%} + {% endif %} + + {% assign seo_tags = seo_tags | replace: target, replacement %} + {% endunless %} + {% endif %} + + {{ seo_tags }} + + + {%- unless page.layout == 'home' -%} + {{ page.title | append: ' | ' }} + {%- endunless -%} + {{ site.title }} + + + {% include_cached favicons.html %} + + {% if site.resources.ignore_env != jekyll.environment and site.resources.self_hosted %} + + + {% else %} + {% for cdn in site.data.origin[type].cdns %} + + + {% endfor %} + + + {% endif %} + + + {% if jekyll.environment == 'production' and site.google_analytics.id != empty and site.google_analytics.id %} + + + + + + {% endif %} + + + + + + + + + + {% if site.toc and page.toc %} + + {% endif %} + + {% if page.layout == 'page' or page.layout == 'post' %} + + + {% endif %} + + + + {% unless site.theme_mode %} + {% include mode-toggle.html %} + {% endunless %} + + {% include metadata-hook.html %} + diff --git a/_includes/js-selector.html b/_includes/js-selector.html new file mode 100644 index 0000000..f6c8e9d --- /dev/null +++ b/_includes/js-selector.html @@ -0,0 +1,106 @@ + + + + +{% assign urls = site.data.origin[type].jquery.js + | append: ',' + | append: site.data.origin[type].bootstrap.js + | append: ',' + | append: site.data.origin[type].search.js +%} + + + +{% if page.layout == 'post' or page.layout == 'page' or page.layout == 'home' %} + {% assign urls = urls | append: ',' | append: site.data.origin[type].lazysizes.js %} + + {% unless page.layout == 'home' %} + + {% assign urls = urls + | append: ',' + | append: site.data.origin[type]['magnific-popup'].js + | append: ',' + | append: site.data.origin[type].clipboard.js + %} + {% endunless %} +{% endif %} + +{% if page.layout == 'home' + or page.layout == 'post' + or page.layout == 'archives' + or page.layout == 'category' + or page.layout == 'tag' +%} + {% assign locale = site.lang | split: '-' | first %} + + {% assign urls = urls + | append: ',' + | append: site.data.origin[type].dayjs.js.common + | append: ',' + | append: site.data.origin[type].dayjs.js.locale + | replace: ':LOCALE', locale + | append: ',' + | append: site.data.origin[type].dayjs.js.relativeTime + | append: ',' + | append: site.data.origin[type].dayjs.js.localizedFormat + %} +{% endif %} + +{% if page.content contains ' + +{% if page.math %} + + + + +{% endif %} + +{% if jekyll.environment == 'production' %} + + {% if site.pwa.enabled %} + + {% else %} + + {% endif %} + + + {% if site.google_analytics.id != empty and site.google_analytics.id %} + {% include google-analytics.html %} + {% endif %} +{% 
endif %} diff --git a/_includes/jsdelivr-combine.html b/_includes/jsdelivr-combine.html new file mode 100644 index 0000000..cffa699 --- /dev/null +++ b/_includes/jsdelivr-combine.html @@ -0,0 +1,26 @@ +{% assign urls = include.urls | split: ',' %} + +{% assign combined_urls = nil %} + +{% assign domain = 'https://cdn.jsdelivr.net/' %} + +{% for url in urls %} + {% if url contains domain %} + {% assign url_snippet = url | slice: domain.size, url.size %} + + {% if combined_urls %} + {% assign combined_urls = combined_urls | append: ',' | append: url_snippet %} + {% else %} + {% assign combined_urls = domain | append: 'combine/' | append: url_snippet %} + {% endif %} + + {% elsif url contains '//' %} + + {% else %} + + {% endif %} +{% endfor %} + +{% if combined_urls %} + +{% endif %} diff --git a/_includes/lang.html b/_includes/lang.html new file mode 100644 index 0000000..19558a0 --- /dev/null +++ b/_includes/lang.html @@ -0,0 +1,8 @@ +{% comment %} + Detect appearance language and return it through variable "lang" +{% endcomment %} +{% if site.data.locales[site.lang] %} + {% assign lang = site.lang %} +{% else %} + {% assign lang = 'en' %} +{% endif %} diff --git a/_includes/language-alias.html b/_includes/language-alias.html new file mode 100644 index 0000000..abfa7ba --- /dev/null +++ b/_includes/language-alias.html @@ -0,0 +1,70 @@ +{% comment %} + + Convert the alias of the syntax language to the official name + + See: + +{% endcomment %} + +{% assign _lang = include.language | default: '' %} + +{% case _lang %} + {% when 'actionscript', 'as', 'as3' %} + {{ 'ActionScript' }} + {% when 'applescript' %} + {{ 'AppleScript' }} + {% when 'brightscript', 'bs', 'brs' %} + {{ 'BrightScript' }} + {% when 'cfscript', 'cfc' %} + {{ 'CFScript' }} + {% when 'coffeescript', 'coffee', 'coffee-script' %} + {{ 'CoffeeScript' }} + {% when 'cs', 'csharp' %} + {{ 'C#' }} + {% when 'erl' %} + {{ 'Erlang' }} + {% when 'graphql' %} + {{ 'GraphQL' }} + {% when 'haskell', 'hs' %} + {{ 'Haskell' }} + {% when 'javascript', 'js' %} + {{ 'JavaScript' }} + {% when 'make', 'mf', 'gnumake', 'bsdmake' %} + {{ 'Makefile' }} + {% when 'md', 'mkd' %} + {{ 'Markdown' }} + {% when 'm' %} + {{ 'Matlab' }} + {% when 'objective_c', 'objc', 'obj-c', 'obj_c', 'objectivec' %} + {{ 'Objective-C' }} + {% when 'perl', 'pl' %} + {{ 'Perl' }} + {% when 'php','php3','php4','php5' %} + {{ 'PHP' }} + {% when 'py' %} + {{ 'Python' }} + {% when 'rb' %} + {{ 'Ruby' }} + {% when 'rs','no_run','ignore','should_panic' %} + {{ 'Rust' }} + {% when 'bash', 'zsh', 'ksh', 'sh' %} + {{ 'Shell' }} + {% when 'st', 'squeak' %} + {{ 'Smalltalk' }} + {% when 'tex'%} + {{ 'TeX' }} + {% when 'latex' %} + {{ 'LaTex' }} + {% when 'ts', 'typescript' %} + {{ 'TypeScript' }} + {% when 'vb', 'visualbasic' %} + {{ 'Visual Basic' }} + {% when 'vue', 'vuejs' %} + {{ 'Vue.js' }} + {% when 'yml' %} + {{ 'YAML' }} + {% when 'css', 'html', 'scss', 'ssh', 'toml', 'xml', 'yaml', 'json' %} + {{ _lang | upcase }} + {% else %} + {{ _lang | capitalize }} +{% endcase %} diff --git a/_includes/mermaid.html b/_includes/mermaid.html new file mode 100644 index 0000000..967cfb4 --- /dev/null +++ b/_includes/mermaid.html @@ -0,0 +1,58 @@ + + diff --git a/_includes/metadata-hook.html b/_includes/metadata-hook.html new file mode 100644 index 0000000..fd7e9bd --- /dev/null +++ b/_includes/metadata-hook.html @@ -0,0 +1 @@ + diff --git a/_includes/mode-toggle.html b/_includes/mode-toggle.html new file mode 100644 index 0000000..a347750 --- /dev/null +++ b/_includes/mode-toggle.html @@ 
-0,0 +1,143 @@ + + + diff --git a/_includes/no-linenos.html b/_includes/no-linenos.html new file mode 100644 index 0000000..8500693 --- /dev/null +++ b/_includes/no-linenos.html @@ -0,0 +1,10 @@ +{% comment %} + Remove the line number of the code snippet. +{% endcomment %} + +{% assign content = include.content %} + +{% if content contains '
' %}
+  {% assign content = content | replace: '
', '' %}
+{% endif %}
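
The no-linenos.html include above strips the line-number markup that the syntax highlighter wraps around code blocks; the literal strings it replaces were lost when this diff's HTML was stripped. Line numbers are usually switched on through kramdown's Rouge options in _config.yml, roughly like this (a sketch of the typical setup, not part of this diff):

  # _config.yml (typical kramdown/Rouge settings, assumed rather than shown in this diff)
  kramdown:
    syntax_highlighter: rouge
    syntax_highlighter_opts:
      block:
        line_numbers: true   # produces the markup that no-linenos.html removes
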
diff --git a/_includes/origin-type.html b/_includes/origin-type.html
new file mode 100644
index 0000000..7f72012
--- /dev/null
+++ b/_includes/origin-type.html
@@ -0,0 +1,13 @@
+{% comment %} Site static assets origin type {% endcomment %}
+
+{% assign type = 'cors' %}
+
+{% if site.assets.self_host.enabled %}
+  {% if site.assets.self_host.env %}
+    {% if site.assets.self_host.env == jekyll.environment %}
+      {% assign type = 'basic' %}
+    {% endif %}
+  {% else %}
+    {% assign type = 'basic' %}
+  {% endif %}
+{% endif %}
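
origin-type.html decides where static assets are loaded from: type stays 'cors' (the CDN URLs in _data/origin/cors.yml) unless self-hosting is enabled, in which case it becomes 'basic' (the local paths in _data/origin/basic.yml), optionally only for one Jekyll environment. A config sketch using the keys referenced above; the env value is just an example:

  # _config.yml (key path taken from the Liquid above; 'production' is an example value)
  assets:
    self_host:
      enabled: true      # serve the files from _data/origin/basic.yml instead of the CDN
      env: production    # optional; limit self-hosting to this jekyll.environment
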
diff --git a/_includes/post-nav.html b/_includes/post-nav.html
new file mode 100644
index 0000000..76bcd59
--- /dev/null
+++ b/_includes/post-nav.html
@@ -0,0 +1,37 @@
+
+
+
+ {% if page.previous.url %} + +

{{ page.previous.title }}

+
+ {% else %} +
+

-

+
+ {% endif %} + + {% if page.next.url %} + +

{{ page.next.title }}

+
+ {% else %} +
+

-

+
+ {% endif %} +
diff --git a/_includes/post-paginator.html b/_includes/post-paginator.html new file mode 100644 index 0000000..668b49f --- /dev/null +++ b/_includes/post-paginator.html @@ -0,0 +1,89 @@ + + +
    + + {% if paginator.previous_page %} + {% assign prev_url = paginator.previous_page_path | relative_url %} + {% else %} + {% assign prev_url = '#' %} + {% endif %} + +
+ + + +
+ + + {% assign left_ellipsis = false %} + {% assign right_ellipsis = false %} + + {% for i in (1..paginator.total_pages) %} + {% assign pre = paginator.page | minus: 1 %} + {% assign next = paginator.page | plus: 1 %} + {% assign pre_less = pre | minus: 1 %} + {% assign next_more = next | plus: 1 %} + {% assign show = false %} + + {% if paginator.page == 1 %} + {% if i <= 3 or i == paginator.total_pages %} + {% assign show = true %} + {% endif %} + {% elsif paginator.page == paginator.total_pages %} + {% if i == 1 or i >= pre_less %} + {% assign show = true %} + {% endif %} + {% else %} + {% if i == 1 or i == paginator.total_pages %} + {% assign show = true %} + {% elsif i >= pre and i <= next %} + {% assign show = true %} + {% endif %} + {% endif %} + + {% if show %} +
+ + {{- i -}} + +
+ {% else %} + + {% if i < pre and left_ellipsis == false %} +
+ ... +
+ {% assign left_ellipsis = true %} + {% elsif i > next and right_ellipsis == false %} +
+ ... +
+ {% assign right_ellipsis = true %} + {% endif %} + {% endif %} + {% endfor %} + +
+ {{ paginator.page }} + / {{ paginator.total_pages }} +
+ + {% if paginator.next_page_path %} + {% assign next_url = paginator.next_page_path | relative_url %} + {% else %} + {% assign next_url = '#' %} + {% endif %} +
+ + + +
+
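
The paginator above windows the page list: the first and last pages and the current page's neighbours are always shown, with at most one "..." gap inserted on each side. The paginator object itself comes from Jekyll's pagination plugin, typically enabled in _config.yml roughly as follows (the page size is an example value):

  # _config.yml (jekyll-paginate; the count is an example value)
  paginate: 10   # number of posts per page on the home feed
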
+ diff --git a/_includes/post-sharing.html b/_includes/post-sharing.html new file mode 100644 index 0000000..f607ba2 --- /dev/null +++ b/_includes/post-sharing.html @@ -0,0 +1,35 @@ + + + diff --git a/_includes/read-time.html b/_includes/read-time.html new file mode 100644 index 0000000..9952410 --- /dev/null +++ b/_includes/read-time.html @@ -0,0 +1,37 @@ + + +{% assign words = include.content | strip_html | number_of_words: 'auto' %} + + + +{% assign wpm = 180 %} +{% assign min_time = 1 %} + +{% assign read_time = words | divided_by: wpm %} + +{% unless read_time > 0 %} + {% assign read_time = min_time %} +{% endunless %} + +{% capture read_prompt %} + {{- site.data.locales[include.lang].post.read_time.prompt -}} +{% endcapture %} + + + + + {{- read_time -}} + {{ ' ' }} + {{- site.data.locales[include.lang].post.read_time.unit -}} + + {%- if include.prompt -%} + {%- assign _prompt_words = read_prompt | number_of_words: 'auto' -%} + {%- unless _prompt_words > 1 -%}{{ ' ' }}{%- endunless -%} + {{ read_prompt }} + {%- endif -%} + diff --git a/_includes/refactor-content.html b/_includes/refactor-content.html new file mode 100644 index 0000000..655ecd6 --- /dev/null +++ b/_includes/refactor-content.html @@ -0,0 +1,286 @@ + + +{% assign _content = include.content %} + + + +{% if _content contains '', '' + | replace: '
', '' + | replace: '
', '
' + %} +{% endif %} + + + +{% if _content contains '
' %}
+  {% assign _content = _content
+    | replace: '
', '' + %} +{% endif %} + + +{% if _content contains '', + '' + | replace: + '', + '' + %} +{% endif %} + + +{% assign IMG_TAG = ' + {% if site.img_cdn %} + {% if site.img_cdn contains '//' %} + {% assign _path_prefix = site.img_cdn %} + {% else %} + {% assign _path_prefix = site.img_cdn | relative_url %} + {% endif %} + {% else %} + {% assign _path_prefix = site.baseurl %} + {% endif %} + + + {% if page.img_path %} + {% assign _path = page.img_path | append: '/' | replace: '//', '/' %} + {% assign _path_prefix = _path_prefix | append: _path %} + {% endif %} + + {% for _img_snippet in _img_snippets %} + {% if forloop.first %} + {% assign _img_content = _img_snippet %} + {% continue %} + {% endif %} + + {% assign _left = _img_snippet | split: '>' | first %} + {% assign _right = _img_snippet | remove: _left %} + + {% unless _left contains 'src=' %} + {% continue %} + {% endunless %} + + {% assign _left = _left | remove: ' /' | replace: ' w=', ' width=' | replace: ' h=', ' height=' %} + {% assign _attrs = _left | split: '" ' %} + + {% assign _width = nil %} + {% assign _height = nil %} + {% assign _lqip = nil %} + {% assign _class = nil %} + + {% for _attr in _attrs %} + {% unless _attr contains '=' %} + {% continue %} + {% endunless %} + + {% assign _pair = _attr | split: '="' %} + {% capture _key %}{{ _pair | first }}{% endcapture %} + {% capture _value %}{{ _pair | last | remove: '"' }}{% endcapture %} + + {% case _key %} + {% when 'width' %} + {% assign _width = _value %} + {% when 'height' %} + {% assign _height = _value %} + {% when 'src' %} + {% assign _src = _value %} + {% when 'lqip' %} + {% assign _lqip = _value %} + {% when 'class' %} + {% assign _class = _value %} + {% endcase %} + {% endfor %} + + + {% if _class %} + {% capture _old_class %}class="{{ _class }}"{% endcapture %} + {% assign _left = _left | remove: _old_class %} + {% endif %} + + {% assign _final_src = nil %} + + {% unless _src contains '//' %} + {% assign _final_src = _path_prefix | append: _src %} + {% capture _src_from %}"{{ _src }}"{% endcapture %} + {% capture _src_to %}"{{ _final_src }}"{% endcapture %} + {% assign _left = _left | replace: _src_from, _src_to %} + {% endunless %} + + {% if _lqip %} + {% unless _lqip contains ':' %} + {% assign _final_lqip = _path_prefix | append: _lqip %} + {% capture _lqip_from %}"{{ _lqip }}"{% endcapture %} + {% capture _lqip_to %}"{{ _final_lqip }}"{% endcapture %} + {% assign _left = _left | replace: _lqip_from, _lqip_to %} + {% endunless %} + {% endif %} + + + {% assign _left = _left | replace: 'src=', 'data-src=' %} + {% if _left contains 'class=' %} + {% assign _left = _left | replace: 'class="', 'class="lazyload '%} + {% else %} + {% assign _left = _left | append: ' class="lazyload"' %} + {% endif %} + + + {% if _lqip %} + {% assign _left = _left | replace: ' lqip=', ' data-lqip="true" src=' %} + {% else %} + {% if _width and _height %} + + {%- capture _svg -%} + src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 {{ _width }} {{ _height }}'%3E%3C/svg%3E" + {%- endcapture -%} + {% assign _left = _svg | append: ' ' | append: _left %} + {% assign _class = _class | append: ' shimmer' %} + {% endif %} + {% endif %} + + + {% assign _left = _left | append: ' data-proofer-ignore' %} + + {% if page.layout == 'home' %} + + {%- capture _wrapper_start -%} +
+ {%- endcapture -%} + {% assign _img_content = _img_content | append: _wrapper_start %} + {% assign _right = _right | prepend: '>` is wrapped by `` --> + {% assign _parent = _right | slice: 1, 4 %} + + {% if _parent == '' %} + + {% assign _size = _img_content | size | minus: 1 %} + {% capture _class %} + class="img-link{% unless _lqip %} shimmer{% endunless %}" + {% endcapture %} + {% assign _img_content = _img_content | slice: 0, _size | append: _class | append: '>' %} + {% else %} + + {%- capture _wrapper_start -%} + + {%- endcapture -%} + {% assign _img_content = _img_content | append: _wrapper_start %} + {% assign _right = _right | prepend: '> + {% assign _img_content = _img_content | append: debug | append: IMG_TAG | append: _left | append: _right %} + + {% endfor %} + + {% if _img_content %} + {% assign _content = _img_content %} + {% endif %} + +{% endif %} + + + +{% if _content contains '
' %} + {% assign _code_spippets = _content | split: '
' %} + {% assign _new_content = '' %} + + {% for _snippet in _code_spippets %} + + {% if forloop.last %} + {% assign _new_content = _new_content | append: _snippet %} + + {% else %} + + {% assign _left = _snippet | split: '><' | last%} + + {% if _left contains 'file="' %} + {% assign _label_text = _left | split: 'file="' | last | split: '"' | first %} + {% assign _label_icon = 'far fa-file-code fa-fw' %} + {% else %} + {% assign _lang = _left | split: 'language-' | last | split: ' ' | first %} + {% capture _label_text %}{% include language-alias.html language=_lang %}{% endcapture %} + {% assign _label_icon = 'fas fa-code fa-fw small' %} + {% endif %} + + {% capture _label %} + + {% endcapture %} + + {% assign _new_content = _new_content | append: _snippet + | append: '
' + | append: _label + | append: '
' + | append: '
' + %} + + {% endif %} + + {% endfor %} + + {% assign _content = _new_content %} + +{% endif %} + + + +{% assign heading_levels = '2,3,4,5' | split: ',' %} +{% assign _heading_content = _content %} + +{% for level in heading_levels %} + {% capture mark_start %}{% endcapture %} + + {% assign left = snippet | split: mark_end | first %} + {% assign right = snippet | slice: left.size, snippet.size %} + {% assign left = left | replace_first: '">', '">' | append: '' %} + + {% assign _new_content = _new_content | append: mark_start + | append: left | append: anchor | append: right + %} + + {% endfor %} + + {% assign _heading_content = _new_content %} + + {% endif %} +{% endfor %} + +{% assign _content = _heading_content %} + + +{{ _content }} diff --git a/_includes/related-posts.html b/_includes/related-posts.html new file mode 100644 index 0000000..8476a6d --- /dev/null +++ b/_includes/related-posts.html @@ -0,0 +1,104 @@ + + + +{% assign TOTAL_SIZE = 3 %} + + +{% assign TAG_SCORE = 1 %} + + +{% assign CATEGORY_SCORE = 0.5 %} + +{% assign SEPARATOR = ':' %} + +{% assign score_list = '' | split: '' %} +{% assign last_index = site.posts.size | minus: 1 %} + +{% for i in (0..last_index) %} + {% assign post = site.posts[i] %} + + {% if post.url == page.url %} + {% continue %} + {% endif %} + + {% assign score = 0 %} + + {% for tag in post.tags %} + {% if page.tags contains tag %} + {% assign score = score | plus: TAG_SCORE %} + {% endif %} + {% endfor %} + + {% for category in post.categories %} + {% if page.categories contains category %} + {% assign score = score | plus: CATEGORY_SCORE %} + {% endif %} + {% endfor %} + + {% if score > 0 %} + {% capture score_item %}{{ score }}{{ SEPARATOR }}{{ i }}{% endcapture %} + {% assign score_list = score_list | push: score_item %} + {% endif %} +{% endfor %} + +{% assign index_list = '' | split: '' %} + +{% if score_list.size > 0 %} + {% assign score_list = score_list | sort | reverse %} + {% for entry in score_list limit: TOTAL_SIZE %} + {% assign index = entry | split: SEPARATOR | last %} + {% assign index_list = index_list | push: index %} + {% endfor %} +{% endif %} + + +{% assign less = TOTAL_SIZE | minus: index_list.size %} + +{% if less > 0 %} + {% for i in (0..last_index) %} + {% assign post = site.posts[i] %} + {% if post.url != page.url %} + {% capture cur_index %}{{ i }}{% endcapture %} + {% unless index_list contains cur_index %} + {% assign index_list = index_list | push: cur_index %} + {% assign less = less | minus: 1 %} + {% if less <= 0 %} + {% break %} + {% endif %} + {% endunless %} + {% endif %} + {% endfor %} +{% endif %} + +{% if index_list.size > 0 %} + + +{% endif %} diff --git a/_includes/search-loader.html b/_includes/search-loader.html new file mode 100644 index 0000000..634325b --- /dev/null +++ b/_includes/search-loader.html @@ -0,0 +1,45 @@ + + +{% capture result_elem %} +
+ {title} + +

{snippet}

+
+{% endcapture %} + +{% capture not_found %}

{{ site.data.locales[include.lang].search.no_results }}

{% endcapture %} + + diff --git a/_includes/search-results.html b/_includes/search-results.html new file mode 100644 index 0000000..07981ff --- /dev/null +++ b/_includes/search-results.html @@ -0,0 +1,10 @@ + + +
+
+
+ {% include_cached trending-tags.html %} +
+
+
+
diff --git a/_includes/sidebar.html b/_includes/sidebar.html new file mode 100644 index 0000000..1c81685 --- /dev/null +++ b/_includes/sidebar.html @@ -0,0 +1,104 @@ + + + + diff --git a/_includes/toc.html b/_includes/toc.html new file mode 100644 index 0000000..1eb3dcd --- /dev/null +++ b/_includes/toc.html @@ -0,0 +1,13 @@ +{% assign enable_toc = false %} +{% if site.toc and page.toc %} + {% if page.content contains ' +
{{- site.data.locales[include.lang].panel.toc -}}
+ +
+{% endif %} diff --git a/_includes/topbar.html b/_includes/topbar.html new file mode 100644 index 0000000..0092f69 --- /dev/null +++ b/_includes/topbar.html @@ -0,0 +1,70 @@ + + +
+
+ + {% assign paths = page.url | split: '/' %} + + {% if paths.size == 0 or page.layout == 'home' %} + + {{ site.data.locales[include.lang].tabs.home | capitalize }} + + {% else %} + {% for item in paths %} + {% if forloop.first %} + + + {{ site.data.locales[include.lang].tabs.home | capitalize }} + + + + {% elsif forloop.last %} + {% if page.collection == 'tabs' %} + {{ site.data.locales[include.lang].tabs[item] | default: page.title }} + {% else %} + {{ page.title }} + {% endif %} + + {% elsif page.layout == 'category' or page.layout == 'tag' %} + + + {{ site.data.locales[include.lang].tabs[item] | default: page.title }} + + + {% endif %} + {% endfor %} + {% endif %} + + + + + +
+ {% if page.layout == 'home' %} + {{- site.data.locales[include.lang].title | default: site.title -}} + {% elsif page.collection == 'tabs' or page.layout == 'page' %} + {%- capture tab_key -%}{{ page.url | split: '/' }}{%- endcapture -%} + {{- site.data.locales[include.lang].tabs[tab_key] | default: page.title -}} + {% else %} + {{- site.data.locales[include.lang].layout[page.layout] | default: page.layout | capitalize -}} + {% endif %} +
+ + + + + + + {{ site.data.locales[include.lang].search.cancel }} +
+
diff --git a/_includes/trending-tags.html b/_includes/trending-tags.html new file mode 100644 index 0000000..6b1d732 --- /dev/null +++ b/_includes/trending-tags.html @@ -0,0 +1,46 @@ + + +{% assign MAX = 10 %} + +{% assign size_list = '' | split: '' %} +{% assign tag_list = '' | split: '' %} + +{% for tag in site.tags %} + {% assign size = tag | last | size %} + {% assign size_list = size_list | push: size %} + + {% assign tag_str = tag | first | append: '::' | append: size %} + {% assign tag_list = tag_list | push: tag_str %} +{% endfor %} + +{% assign size_list = size_list | sort | reverse %} + +{% assign tag_list = tag_list | sort_natural %} + +{% assign trending_tags = '' | split: '' %} + +{% for size in size_list limit: MAX %} + {% for tag_str in tag_list %} + {% assign tag = tag_str | split: '::' %} + {% assign tag_name = tag | first %} + {% assign tag_size = tag | last | plus: 0 %} + {% if tag_size == size %} + {% unless trending_tags contains tag_name %} + {% assign trending_tags = trending_tags | push: tag_name %} + {% break %} + {% endunless %} + {% endif %} + {% endfor %} +{% endfor %} + +{% if trending_tags.size > 0 %} +
+
{{- site.data.locales[include.lang].panel.trending_tags -}}
+
+ {% for tag_name in trending_tags %} + {% assign url = tag_name | slugify | url_encode | prepend: '/tags/' | append: '/' %} + + {% endfor %} +
+
+{% endif %} diff --git a/_includes/update-list.html b/_includes/update-list.html new file mode 100644 index 0000000..0ab7a45 --- /dev/null +++ b/_includes/update-list.html @@ -0,0 +1,39 @@ + + +{% assign MAX_SIZE = 5 %} + +{% assign all_list = '' | split: '' %} + +{% for post in site.posts %} + {% if post.last_modified_at and post.last_modified_at != post.date %} + {% capture elem %} + {{- post.last_modified_at | date: "%Y%m%d%H%M%S" -}}::{{- forloop.index0 -}} + {% endcapture %} + {% assign all_list = all_list | push: elem %} + {% endif %} +{% endfor %} + +{% assign all_list = all_list | sort | reverse %} + +{% assign update_list = '' | split: '' %} + +{% for entry in all_list limit: MAX_SIZE %} + {% assign update_list = update_list | push: entry %} +{% endfor %} + +{% if update_list.size > 0 %} +
+
{{- site.data.locales[include.lang].panel.lastmod -}}
+
    + {% for item in update_list %} + {% assign index = item | split: '::' | last | plus: 0 %} + {% assign post = site.posts[index] %} + {% assign url = post.url | relative_url %} +
  • + {{ post.title }} +
  • + {% endfor %} +
+
+ +{% endif %} diff --git a/_javascript/_copyright b/_javascript/_copyright new file mode 100644 index 0000000..dedc8ed --- /dev/null +++ b/_javascript/_copyright @@ -0,0 +1,3 @@ +Chirpy v<%= pkg.version %> (<%= pkg.homepage %>) +© 2019 <%= pkg.author %> +<%= pkg.license %> Licensed diff --git a/_javascript/categories.js b/_javascript/categories.js new file mode 100644 index 0000000..15d8251 --- /dev/null +++ b/_javascript/categories.js @@ -0,0 +1,7 @@ +import { basic, initSidebar, initTopbar } from './modules/layouts'; +import { categoryCollapse } from './modules/plugins'; + +basic(); +initSidebar(); +initTopbar(); +categoryCollapse(); diff --git a/_javascript/commons.js b/_javascript/commons.js new file mode 100644 index 0000000..05a9765 --- /dev/null +++ b/_javascript/commons.js @@ -0,0 +1,5 @@ +import { basic, initSidebar, initTopbar } from './modules/layouts'; + +basic(); +initSidebar(); +initTopbar(); diff --git a/_javascript/home.js b/_javascript/home.js new file mode 100644 index 0000000..70af328 --- /dev/null +++ b/_javascript/home.js @@ -0,0 +1,8 @@ +import { basic, initSidebar, initTopbar } from './modules/layouts'; +import { initLocaleDatetime, imgLazy } from './modules/plugins'; + +basic(); +initSidebar(); +initTopbar(); +initLocaleDatetime(); +imgLazy(); diff --git a/_javascript/misc.js b/_javascript/misc.js new file mode 100644 index 0000000..c7a19d6 --- /dev/null +++ b/_javascript/misc.js @@ -0,0 +1,7 @@ +import { basic, initSidebar, initTopbar } from './modules/layouts'; +import { initLocaleDatetime } from './modules/plugins'; + +basic(); +initSidebar(); +initTopbar(); +initLocaleDatetime(); diff --git a/_javascript/modules/components/back-to-top.js b/_javascript/modules/components/back-to-top.js new file mode 100644 index 0000000..777a659 --- /dev/null +++ b/_javascript/modules/components/back-to-top.js @@ -0,0 +1,20 @@ +/** + * Reference: https://bootsnipp.com/snippets/featured/link-to-top-page + */ + +export function back2top() { + const $window = $(window); + const $btn = $('#back-to-top'); + + $window.on('scroll', () => { + if ($window.scrollTop() > 50) { + $btn.fadeIn(); + } else { + $btn.fadeOut(); + } + }); + + $btn.on('click', () => { + $window.scrollTop(0); + }); +} diff --git a/_javascript/modules/components/category-collapse.js b/_javascript/modules/components/category-collapse.js new file mode 100644 index 0000000..d6027a1 --- /dev/null +++ b/_javascript/modules/components/category-collapse.js @@ -0,0 +1,36 @@ +/** + * Tab 'Categories' expand/close effect. + */ +const childPrefix = 'l_'; +const parentPrefix = 'h_'; +const collapse = $('.collapse'); + +export function categoryCollapse() { + /* close up top-category */ + collapse.on('hide.bs.collapse', function () { + /* Bootstrap collapse events. 
*/ const parentId = + parentPrefix + $(this).attr('id').substring(childPrefix.length); + if (parentId) { + $(`#${parentId} .far.fa-folder-open`).attr( + 'class', + 'far fa-folder fa-fw' + ); + $(`#${parentId} i.fas`).addClass('rotate'); + $(`#${parentId}`).removeClass('hide-border-bottom'); + } + }); + + /* expand the top category */ + collapse.on('show.bs.collapse', function () { + const parentId = + parentPrefix + $(this).attr('id').substring(childPrefix.length); + if (parentId) { + $(`#${parentId} .far.fa-folder`).attr( + 'class', + 'far fa-folder-open fa-fw' + ); + $(`#${parentId} i.fas`).removeClass('rotate'); + $(`#${parentId}`).addClass('hide-border-bottom'); + } + }); +} diff --git a/_javascript/modules/components/clipboard.js b/_javascript/modules/components/clipboard.js new file mode 100644 index 0000000..f803843 --- /dev/null +++ b/_javascript/modules/components/clipboard.js @@ -0,0 +1,123 @@ +/** + * Clipboard functions + * + * Dependencies: + * - popper.js (https://github.com/popperjs/popper-core) + * - clipboard.js (https://github.com/zenorocha/clipboard.js) + */ + +const clipboardSelector = '.code-header>button'; +const ICON_SUCCESS = 'fas fa-check'; +const ATTR_TIMEOUT = 'timeout'; +const ATTR_TITLE_SUCCEED = 'data-title-succeed'; +const ATTR_TITLE_ORIGIN = 'data-bs-original-title'; +const TIMEOUT = 2000; // in milliseconds + +function isLocked(node) { + if ($(node)[0].hasAttribute(ATTR_TIMEOUT)) { + let timeout = $(node).attr(ATTR_TIMEOUT); + if (Number(timeout) > Date.now()) { + return true; + } + } + return false; +} + +function lock(node) { + $(node).attr(ATTR_TIMEOUT, Date.now() + TIMEOUT); +} + +function unlock(node) { + $(node).removeAttr(ATTR_TIMEOUT); +} + +function getIcon(btn) { + let iconNode = $(btn).children(); + return iconNode.attr('class'); +} + +const ICON_DEFAULT = getIcon(clipboardSelector); + +function showTooltip(btn) { + const succeedTitle = $(btn).attr(ATTR_TITLE_SUCCEED); + $(btn).attr(ATTR_TITLE_ORIGIN, succeedTitle).tooltip('show'); +} + +function hideTooltip(btn) { + $(btn).tooltip('hide').removeAttr(ATTR_TITLE_ORIGIN); +} + +function setSuccessIcon(btn) { + let btnNode = $(btn); + let iconNode = btnNode.children(); + iconNode.attr('class', ICON_SUCCESS); +} + +function resumeIcon(btn) { + let btnNode = $(btn); + let iconNode = btnNode.children(); + iconNode.attr('class', ICON_DEFAULT); +} + +export function initClipboard() { + // Initial the clipboard.js object + if ($(clipboardSelector).length) { + const clipboard = new ClipboardJS(clipboardSelector, { + target(trigger) { + let codeBlock = trigger.parentNode.nextElementSibling; + return codeBlock.querySelector('code .rouge-code'); + } + }); + + const clipboardList = document.querySelectorAll(clipboardSelector); + [...clipboardList].map( + (elem) => + new bootstrap.Tooltip(elem, { + placement: 'left' + }) + ); + + clipboard.on('success', (e) => { + e.clearSelection(); + + const trigger = e.trigger; + if (isLocked(trigger)) { + return; + } + + setSuccessIcon(trigger); + showTooltip(trigger); + lock(trigger); + + setTimeout(() => { + hideTooltip(trigger); + resumeIcon(trigger); + unlock(trigger); + }, TIMEOUT); + }); + } + + /* --- Post link sharing --- */ + + $('#copy-link').on('click', (e) => { + let target = $(e.target); + + if (isLocked(target)) { + return; + } + + // Copy URL to clipboard + navigator.clipboard.writeText(window.location.href).then(() => { + const defaultTitle = target.attr(ATTR_TITLE_ORIGIN); + const succeedTitle = target.attr(ATTR_TITLE_SUCCEED); + // Switch tooltip title + 
target.attr(ATTR_TITLE_ORIGIN, succeedTitle).tooltip('show'); + lock(target); + + setTimeout(() => { + target.attr(ATTR_TITLE_ORIGIN, defaultTitle); + unlock(target); + }, TIMEOUT); + }); + }); +} diff --git a/_javascript/modules/components/img-lazyload.js b/_javascript/modules/components/img-lazyload.js new file mode 100644 index 0000000..edad9dd --- /dev/null +++ b/_javascript/modules/components/img-lazyload.js @@ -0,0 +1,27 @@ +/** + * Set up image lazy-load + */ + +function stopShimmer($node) { + $node.parent().removeClass('shimmer'); +} + +export function imgLazy() { + const $images = $('#core-wrapper img[data-src]'); + + if ($images.length <= 0) { + return; + } + + /* Stop shimmer when image loaded */ + document.addEventListener('lazyloaded', function (e) { + stopShimmer($(e.target)); + }); + + /* Stop shimmer from cached images */ + $images.each(function () { + if ($(this).hasClass('ls-is-cached')) { + stopShimmer($(this)); + } + }); +} diff --git a/_javascript/modules/components/img-popup.js b/_javascript/modules/components/img-popup.js new file mode 100644 index 0000000..7f78d99 --- /dev/null +++ b/_javascript/modules/components/img-popup.js @@ -0,0 +1,22 @@ +/** + * Set up image popup + * + * See: https://github.com/dimsemenov/Magnific-Popup + */ + +export function imgPopup() { + if ($('.popup') <= 0) { + return; + } + + $('.popup').magnificPopup({ + type: 'image', + closeOnContentClick: true, + showCloseBtn: false, + zoom: { + enabled: true, + duration: 300, + easing: 'ease-in-out' + } + }); +} diff --git a/_javascript/modules/components/locale-datetime.js b/_javascript/modules/components/locale-datetime.js new file mode 100644 index 0000000..214f2bf --- /dev/null +++ b/_javascript/modules/components/locale-datetime.js @@ -0,0 +1,51 @@ +/** + * Update month/day to locale datetime + * + * Requirement: + */ + +/* A tool for locale datetime */ +class LocaleHelper { + static get attrTimestamp() { + return 'data-ts'; + } + + static get attrDateFormat() { + return 'data-df'; + } + + static get locale() { + return $('html').attr('lang').substring(0, 2); + } + + static getTimestamp(elem) { + return Number(elem.attr(LocaleHelper.attrTimestamp)); // unix timestamp + } + + static getDateFormat(elem) { + return elem.attr(LocaleHelper.attrDateFormat); + } +} + +export function initLocaleDatetime() { + dayjs.locale(LocaleHelper.locale); + dayjs.extend(window.dayjs_plugin_localizedFormat); + + $(`[${LocaleHelper.attrTimestamp}]`).each(function () { + const date = dayjs.unix(LocaleHelper.getTimestamp($(this))); + const text = date.format(LocaleHelper.getDateFormat($(this))); + $(this).text(text); + $(this).removeAttr(LocaleHelper.attrTimestamp); + $(this).removeAttr(LocaleHelper.attrDateFormat); + + // setup tooltips + const tooltip = $(this).attr('data-bs-toggle'); + if (typeof tooltip === 'undefined' || tooltip !== 'tooltip') { + return; + } + + const tooltipText = date.format('llll'); // see: https://day.js.org/docs/en/display/format#list-of-localized-formats + $(this).attr('data-bs-title', tooltipText); + new bootstrap.Tooltip($(this)); + }); +} diff --git a/_javascript/modules/components/mode-watcher.js b/_javascript/modules/components/mode-watcher.js new file mode 100644 index 0000000..7b2298a --- /dev/null +++ b/_javascript/modules/components/mode-watcher.js @@ -0,0 +1,21 @@ +/** + * Add listener for theme mode toggle + */ +const $toggleElem = $('.mode-toggle'); + +export function modeWatcher() { + if ($toggleElem.length === 0) { + return; + } + + $toggleElem.off().on('click', (e) => { + 
const $target = $(e.target); + let $btn = + $target.prop('tagName') === 'button'.toUpperCase() + ? $target + : $target.parent(); + + modeToggle.flipMode(); // modeToggle: `_includes/mode-toggle.html` + $btn.trigger('blur'); // remove the clicking outline + }); +} diff --git a/_javascript/modules/components/search-display.js b/_javascript/modules/components/search-display.js new file mode 100644 index 0000000..7862f39 --- /dev/null +++ b/_javascript/modules/components/search-display.js @@ -0,0 +1,122 @@ +/** + * This script make #search-result-wrapper switch to unloaded or shown automatically. + */ +const $btnSbTrigger = $('#sidebar-trigger'); +const $btnSearchTrigger = $('#search-trigger'); +const $btnCancel = $('#search-cancel'); +const $content = $('#main>.row'); +const $topbarTitle = $('#topbar-title'); +const $searchWrapper = $('#search-wrapper'); +const $resultWrapper = $('#search-result-wrapper'); +const $results = $('#search-results'); +const $input = $('#search-input'); +const $hints = $('#search-hints'); +const $viewport = $('html,body'); + +// class names +const C_LOADED = 'loaded'; +const C_UNLOADED = 'unloaded'; +const C_FOCUS = 'input-focus'; +const C_FLEX = 'd-flex'; + +class ScrollBlocker { + static offset = 0; + static resultVisible = false; + + static on() { + ScrollBlocker.offset = window.scrollY; + $viewport.scrollTop(0); + } + + static off() { + $viewport.scrollTop(ScrollBlocker.offset); + } +} + +/*--- Actions in mobile screens (Sidebar hidden) ---*/ +class MobileSearchBar { + static on() { + $btnSbTrigger.addClass(C_UNLOADED); + $topbarTitle.addClass(C_UNLOADED); + $btnSearchTrigger.addClass(C_UNLOADED); + $searchWrapper.addClass(C_FLEX); + $btnCancel.addClass(C_LOADED); + } + + static off() { + $btnCancel.removeClass(C_LOADED); + $searchWrapper.removeClass(C_FLEX); + $btnSbTrigger.removeClass(C_UNLOADED); + $topbarTitle.removeClass(C_UNLOADED); + $btnSearchTrigger.removeClass(C_UNLOADED); + } +} + +class ResultSwitch { + static on() { + if (!ScrollBlocker.resultVisible) { + // the block method must be called before $(#main) unloaded. 
+ ScrollBlocker.on(); + $resultWrapper.removeClass(C_UNLOADED); + $content.addClass(C_UNLOADED); + ScrollBlocker.resultVisible = true; + } + } + + static off() { + if (ScrollBlocker.resultVisible) { + $results.empty(); + if ($hints.hasClass(C_UNLOADED)) { + $hints.removeClass(C_UNLOADED); + } + $resultWrapper.addClass(C_UNLOADED); + $content.removeClass(C_UNLOADED); + + // now the release method must be called after $(#main) display + ScrollBlocker.off(); + + $input.val(''); + ScrollBlocker.resultVisible = false; + } + } +} + +function isMobileView() { + return $btnCancel.hasClass(C_LOADED); +} + +export function displaySearch() { + $btnSearchTrigger.on('click', function () { + MobileSearchBar.on(); + ResultSwitch.on(); + $input.trigger('focus'); + }); + + $btnCancel.on('click', function () { + MobileSearchBar.off(); + ResultSwitch.off(); + }); + + $input.on('focus', function () { + $searchWrapper.addClass(C_FOCUS); + }); + + $input.on('focusout', function () { + $searchWrapper.removeClass(C_FOCUS); + }); + + $input.on('input', () => { + if ($input.val() === '') { + if (isMobileView()) { + $hints.removeClass(C_UNLOADED); + } else { + ResultSwitch.off(); + } + } else { + ResultSwitch.on(); + if (isMobileView()) { + $hints.addClass(C_UNLOADED); + } + } + }); +} diff --git a/_javascript/modules/components/sidebar.js b/_javascript/modules/components/sidebar.js new file mode 100644 index 0000000..9d8567e --- /dev/null +++ b/_javascript/modules/components/sidebar.js @@ -0,0 +1,25 @@ +/** + * Expand or close the sidebar in mobile screens. + */ + +const $body = $('body'); +const ATTR_DISPLAY = 'sidebar-display'; + +class SidebarUtil { + static isExpanded = false; + + static toggle() { + if (SidebarUtil.isExpanded === false) { + $body.attr(ATTR_DISPLAY, ''); + } else { + $body.removeAttr(ATTR_DISPLAY); + } + + SidebarUtil.isExpanded = !SidebarUtil.isExpanded; + } +} + +export function sidebarExpand() { + $('#sidebar-trigger').on('click', SidebarUtil.toggle); + $('#mask').on('click', SidebarUtil.toggle); +} diff --git a/_javascript/modules/components/toc.js b/_javascript/modules/components/toc.js new file mode 100644 index 0000000..dd46994 --- /dev/null +++ b/_javascript/modules/components/toc.js @@ -0,0 +1,13 @@ +export function toc() { + if (document.querySelector('#core-wrapper h2,#core-wrapper h3')) { + // see: https://github.com/tscanlin/tocbot#usage + tocbot.init({ + tocSelector: '#toc', + contentSelector: '.post-content', + ignoreSelector: '[data-toc-skip]', + headingSelector: 'h2, h3', + orderedList: false, + scrollSmooth: false + }); + } +} diff --git a/_javascript/modules/components/tooltip-loader.js b/_javascript/modules/components/tooltip-loader.js new file mode 100644 index 0000000..a906600 --- /dev/null +++ b/_javascript/modules/components/tooltip-loader.js @@ -0,0 +1,12 @@ +/** + * Initial Bootstrap Tooltip. 
+ */ +export function loadTooptip() { + const tooltipTriggerList = document.querySelectorAll( + '[data-bs-toggle="tooltip"]' + ); + + [...tooltipTriggerList].map( + (tooltipTriggerEl) => new bootstrap.Tooltip(tooltipTriggerEl) + ); +} diff --git a/_javascript/modules/layouts.js b/_javascript/modules/layouts.js new file mode 100644 index 0000000..28f7962 --- /dev/null +++ b/_javascript/modules/layouts.js @@ -0,0 +1,3 @@ +export { basic } from './layouts/basic'; +export { initSidebar } from './layouts/sidebar'; +export { initTopbar } from './layouts/topbar'; diff --git a/_javascript/modules/layouts/basic.js b/_javascript/modules/layouts/basic.js new file mode 100644 index 0000000..fb36a8b --- /dev/null +++ b/_javascript/modules/layouts/basic.js @@ -0,0 +1,7 @@ +import { back2top } from '../components/back-to-top'; +import { loadTooptip } from '../components/tooltip-loader'; + +export function basic() { + back2top(); + loadTooptip(); +} diff --git a/_javascript/modules/layouts/sidebar.js b/_javascript/modules/layouts/sidebar.js new file mode 100644 index 0000000..8795693 --- /dev/null +++ b/_javascript/modules/layouts/sidebar.js @@ -0,0 +1,7 @@ +import { modeWatcher } from '../components/mode-watcher'; +import { sidebarExpand } from '../components/sidebar'; + +export function initSidebar() { + modeWatcher(); + sidebarExpand(); +} diff --git a/_javascript/modules/layouts/topbar.js b/_javascript/modules/layouts/topbar.js new file mode 100644 index 0000000..cfcd0ed --- /dev/null +++ b/_javascript/modules/layouts/topbar.js @@ -0,0 +1,5 @@ +import { displaySearch } from '../components/search-display'; + +export function initTopbar() { + displaySearch(); +} diff --git a/_javascript/modules/plugins.js b/_javascript/modules/plugins.js new file mode 100644 index 0000000..fa7a7dd --- /dev/null +++ b/_javascript/modules/plugins.js @@ -0,0 +1,6 @@ +export { categoryCollapse } from './components/category-collapse'; +export { initClipboard } from './components/clipboard'; +export { imgLazy } from './components/img-lazyload'; +export { imgPopup } from './components/img-popup'; +export { initLocaleDatetime } from './components/locale-datetime'; +export { toc } from './components/toc'; diff --git a/_javascript/page.js b/_javascript/page.js new file mode 100644 index 0000000..7b31813 --- /dev/null +++ b/_javascript/page.js @@ -0,0 +1,9 @@ +import { basic, initSidebar, initTopbar } from './modules/layouts'; +import { imgLazy, imgPopup, initClipboard } from './modules/plugins'; + +basic(); +initSidebar(); +initTopbar(); +imgLazy(); +imgPopup(); +initClipboard(); diff --git a/_javascript/post.js b/_javascript/post.js new file mode 100644 index 0000000..9a5a61b --- /dev/null +++ b/_javascript/post.js @@ -0,0 +1,17 @@ +import { basic, initSidebar, initTopbar } from './modules/layouts'; +import { + imgLazy, + imgPopup, + initLocaleDatetime, + initClipboard, + toc +} from './modules/plugins'; + +basic(); +initSidebar(); +initTopbar(); +imgLazy(); +imgPopup(); +initLocaleDatetime(); +initClipboard(); +toc(); diff --git a/_layouts/archives.html b/_layouts/archives.html new file mode 100644 index 0000000..18e95f5 --- /dev/null +++ b/_layouts/archives.html @@ -0,0 +1,36 @@ +--- +layout: page +# The Archives of posts. +--- + +{% include lang.html %} + +{% assign df_strftime_m = site.data.locales[lang].df.archives.strftime | default: '/ %m' %} +{% assign df_dayjs_m = site.data.locales[lang].df.archives.dayjs | default: '/ MM' %} + +
+ +{% for post in site.posts %} + {% capture cur_year %}{{ post.date | date: "%Y" }}{% endcapture %} + + {% if cur_year != last_year %} + {% unless forloop.first %}{% endunless %} +
{{ cur_year }}
+
    + {% assign last_year = cur_year %} + {% endif %} + +
  • + {% assign ts = post.date | date: '%s' %} + {{ post.date | date: "%d" }} + + {{ post.date | date: df_strftime_m }} + + {{ post.title }} +
  • + + {% if forloop.last %}
{% endif %} + +{% endfor %} + +
diff --git a/_layouts/categories.html b/_layouts/categories.html new file mode 100644 index 0000000..0515097 --- /dev/null +++ b/_layouts/categories.html @@ -0,0 +1,138 @@ +--- +layout: page +# All the Categories of posts +--- + +{% include lang.html %} + +{% assign HEAD_PREFIX = 'h_' %} +{% assign LIST_PREFIX = 'l_' %} + +{% assign group_index = 0 %} + +{% assign sort_categories = site.categories | sort %} + +{% for category in sort_categories %} + {% assign category_name = category | first %} + {% assign posts_of_category = category | last %} + {% assign first_post = posts_of_category | first %} + + {% if category_name == first_post.categories[0] %} + {% assign sub_categories = '' | split: '' %} + + {% for post in posts_of_category %} + {% assign second_category = post.categories[1] %} + {% if second_category %} + {% unless sub_categories contains second_category %} + {% assign sub_categories = sub_categories | push: second_category %} + {% endunless %} + {% endif %} + {% endfor %} + + {% assign sub_categories = sub_categories | sort %} + {% assign sub_categories_size = sub_categories | size %} + +
+ +
+ + + + {% capture _category_url %}/categories/{{ category_name | slugify | url_encode }}/{% endcapture %} + {{ category_name }} + + + {% assign top_posts_size = site.categories[category_name] | size %} + + {% if sub_categories_size > 0 %} + {{ sub_categories_size }} + {% if sub_categories_size > 1 %} + {{ + site.data.locales[lang].categories.category_measure.plural + | default: site.data.locales[lang].categories.category_measure + }} + {% else %} + {{ + site.data.locales[lang].categories.category_measure.singular + | default: site.data.locales[lang].categories.category_measure + }} + {% endif -%} + , + {% endif %} + + {{ top_posts_size }} + + {% if top_posts_size > 1 %} + {{ + site.data.locales[lang].categories.post_measure.plural + | default: site.data.locales[lang].categories.post_measure + }} + {% else %} + {{ + site.data.locales[lang].categories.post_measure.singular + | default: site.data.locales[lang].categories.post_measure + }} + {% endif %} + + + + + {% if sub_categories_size > 0 %} + + + + {% else %} + + + + {% endif %} +
+ + + + {% if sub_categories_size > 0 %} +
+
    + {% for sub_category in sub_categories %} +
  • + + + {% capture _sub_ctg_url %}/categories/{{ sub_category | slugify | url_encode }}/{% endcapture %} + {{ sub_category }} + + {% assign posts_size = site.categories[sub_category] | size %} + + {{ posts_size }} + + {% if posts_size > 1 %} + {{ + site.data.locales[lang].categories.post_measure.plural + | default: site.data.locales[lang].categories.post_measure + }} + {% else %} + {{ + site.data.locales[lang].categories.post_measure.singular + | default: site.data.locales[lang].categories.post_measure + }} + {% endif %} + +
  • + {% endfor %} +
+
+ {% endif %} +
+ + + {% assign group_index = group_index | plus: 1 %} + {% endif %} +{% endfor %} diff --git a/_layouts/category.html b/_layouts/category.html new file mode 100644 index 0000000..84fa487 --- /dev/null +++ b/_layouts/category.html @@ -0,0 +1,24 @@ +--- +layout: page +# The Category layout +--- + +{% include lang.html %} + +
+

+ + {{ page.title }} + {{ page.posts | size }} +

+ +
    + {% for post in page.posts %} +
  • + {{ post.title }} + + {% include datetime.html date=post.date wrap='span' class='text-muted small' lang=lang %} +
  • + {% endfor %} +
+
diff --git a/_layouts/compress.html b/_layouts/compress.html new file mode 100644 index 0000000..bb34487 --- /dev/null +++ b/_layouts/compress.html @@ -0,0 +1,10 @@ +--- +# Jekyll layout that compresses HTML +# v3.1.0 +# http://jch.penibelst.de/ +# © 2014–2015 Anatol Broder +# MIT License +--- + +{% capture _LINE_FEED %} +{% endcapture %}{% if site.compress_html.ignore.envs contains jekyll.environment or site.compress_html.ignore.envs == "all" %}{{ content }}{% else %}{% capture _content %}{{ content }}{% endcapture %}{% assign _profile = site.compress_html.profile %}{% if site.compress_html.endings == "all" %}{% assign _endings = "html head body li dt dd optgroup option colgroup caption thead tbody tfoot tr td th" | split: " " %}{% else %}{% assign _endings = site.compress_html.endings %}{% endif %}{% for _element in _endings %}{% capture _end %}{% endcapture %}{% assign _content = _content | remove: _end %}{% endfor %}{% if _profile and _endings %}{% assign _profile_endings = _content | size | plus: 1 %}{% endif %}{% for _element in site.compress_html.startings %}{% capture _start %}<{{ _element }}>{% endcapture %}{% assign _content = _content | remove: _start %}{% endfor %}{% if _profile and site.compress_html.startings %}{% assign _profile_startings = _content | size | plus: 1 %}{% endif %}{% if site.compress_html.comments == "all" %}{% assign _comments = "" | split: " " %}{% else %}{% assign _comments = site.compress_html.comments %}{% endif %}{% if _comments.size == 2 %}{% capture _comment_befores %}.{{ _content }}{% endcapture %}{% assign _comment_befores = _comment_befores | split: _comments.first %}{% for _comment_before in _comment_befores %}{% if forloop.first %}{% continue %}{% endif %}{% capture _comment_outside %}{% if _carry %}{{ _comments.first }}{% endif %}{{ _comment_before }}{% endcapture %}{% capture _comment %}{% unless _carry %}{{ _comments.first }}{% endunless %}{{ _comment_outside | split: _comments.last | first }}{% if _comment_outside contains _comments.last %}{{ _comments.last }}{% assign _carry = false %}{% else %}{% assign _carry = true %}{% endif %}{% endcapture %}{% assign _content = _content | remove_first: _comment %}{% endfor %}{% if _profile %}{% assign _profile_comments = _content | size | plus: 1 %}{% endif %}{% endif %}{% assign _pre_befores = _content | split: "" %}{% assign _pres_after = "" %}{% if _pres.size != 0 %}{% if site.compress_html.blanklines %}{% assign _lines = _pres.last | split: _LINE_FEED %}{% capture _pres_after %}{% for _line in _lines %}{% assign _trimmed = _line | split: " " | join: " " %}{% if _trimmed != empty or forloop.last %}{% unless forloop.first %}{{ _LINE_FEED }}{% endunless %}{{ _line }}{% endif %}{% endfor %}{% endcapture %}{% else %}{% assign _pres_after = _pres.last | split: " " | join: " " %}{% endif %}{% endif %}{% capture _content %}{{ _content }}{% if _pre_before contains "
" %}{% endif %}{% unless _pre_before contains "
" and _pres.size == 1 %}{{ _pres_after }}{% endunless %}{% endcapture %}{% endfor %}{% if _profile %}{% assign _profile_collapse = _content | size | plus: 1 %}{% endif %}{% if site.compress_html.clippings == "all" %}{% assign _clippings = "html head title base link meta style body article section nav aside h1 h2 h3 h4 h5 h6 hgroup header footer address p hr blockquote ol ul li dl dt dd figure figcaption main div table caption colgroup col tbody thead tfoot tr td th" | split: " " %}{% else %}{% assign _clippings = site.compress_html.clippings %}{% endif %}{% for _element in _clippings %}{% assign _edges = " ;; ;" | replace: "e", _element | split: ";" %}{% assign _content = _content | replace: _edges[0], _edges[1] | replace: _edges[2], _edges[3] | replace: _edges[4], _edges[5] %}{% endfor %}{% if _profile and _clippings %}{% assign _profile_clippings = _content | size | plus: 1 %}{% endif %}{{ _content }}{% if _profile %}
Step Bytes
raw {{ content | size }}{% if _profile_endings %}
endings {{ _profile_endings }}{% endif %}{% if _profile_startings %}
startings {{ _profile_startings }}{% endif %}{% if _profile_comments %}
comments {{ _profile_comments }}{% endif %}{% if _profile_collapse %}
collapse {{ _profile_collapse }}{% endif %}{% if _profile_clippings %}
clippings {{ _profile_clippings }}{% endif %}
{% endif %}{% endif %} diff --git a/_layouts/default.html b/_layouts/default.html new file mode 100644 index 0000000..0047570 --- /dev/null +++ b/_layouts/default.html @@ -0,0 +1,76 @@ +--- +layout: compress +# Default layout +--- + + + +{% include origin-type.html %} + +{% include lang.html %} + +{% capture prefer_mode %} + {% if site.theme_mode %} + data-mode="{{ site.theme_mode }}" + {% endif %} +{% endcapture %} + + + + {% include head.html %} + + + {% include sidebar.html lang=lang %} + +
+
+ {% include topbar.html lang=lang %} + {{ content }} + {% include_cached search-results.html lang=lang %} +
+
+ + {% include_cached footer.html lang=lang %} + +
+ + + + {% if site.pwa.enabled %} + + {% endif %} + + {% include js-selector.html %} + + {% if page.mermaid %} + {% include mermaid.html %} + {% endif %} + + {% include_cached search-loader.html %} + + diff --git a/_layouts/home.html b/_layouts/home.html new file mode 100644 index 0000000..4cda9e4 --- /dev/null +++ b/_layouts/home.html @@ -0,0 +1,110 @@ +--- +layout: page +refactor: true +--- + +{% include lang.html %} + +{% assign pinned = site.posts | where: 'pin', 'true' %} +{% assign default = site.posts | where_exp: 'item', 'item.pin != true and item.hidden != true' %} + +{% assign posts = '' | split: '' %} + + + +{% assign offset = paginator.page | minus: 1 | times: paginator.per_page %} +{% assign pinned_num = pinned.size | minus: offset %} + +{% if pinned_num > 0 %} + {% for i in (offset..pinned.size) limit: pinned_num %} + {% assign posts = posts | push: pinned[i] %} + {% endfor %} +{% else %} + {% assign pinned_num = 0 %} +{% endif %} + + + +{% assign default_beg = offset | minus: pinned.size %} + +{% if default_beg < 0 %} + {% assign default_beg = 0 %} +{% endif %} + +{% assign default_num = paginator.posts | size | minus: pinned_num %} +{% assign default_end = default_beg | plus: default_num | minus: 1 %} + +{% if default_num > 0 %} + {% for i in (default_beg..default_end) %} + {% assign posts = posts | push: default[i] %} + {% endfor %} +{% endif %} + +
+ {% for post in posts %} + +
+ {% if post.image %} + {% if post.image.lqip %} + {% capture lqip %}lqip="{{ post.image.lqip }}"{% endcapture %} + {% endif %} + + {% assign src = post.image.path | default: post.image %} + {% unless src contains '//' %} + {% assign src = post.img_path | append: '/' | append: src | replace: '//', '/' %} + {% endunless %} + + {% assign alt = post.image.alt | xml_escape | default: 'Preview Image' %} + + {{ alt }} + {% endif %} + +
+

+ {{ post.title }} +

+ +
+

+ {% include no-linenos.html content=post.content %} + {{ content | markdownify | strip_html | truncate: 200 | escape }} +

+
+ + + +
+ +
+
+ {% endfor %} +
+ + +{% if paginator.total_pages > 1 %} + {% include post-paginator.html %} +{% endif %} diff --git a/_layouts/page.html b/_layouts/page.html new file mode 100644 index 0000000..148f873 --- /dev/null +++ b/_layouts/page.html @@ -0,0 +1,68 @@ +--- +layout: default +--- + +{% include lang.html %} +{% include origin-type.html %} + +{% if layout.tail_includes %} + {% assign has_tail = true %} +{% endif %} + +
+ +
+ {% capture padding %} + {% unless page.layout == 'home' %}px-1{% endunless %} + {% endcapture %} + +
+ {% capture _content %} + {% if layout.refactor or page.layout == 'page' %} + {% include refactor-content.html content=content lang=lang %} + {% else %} + {{ content }} + {% endif %} + {% endcapture %} + + {% if page.layout == 'page' or page.collection == 'tabs' %} + {% assign tab_key = page.title | downcase %} + {% assign title = site.data.locales[lang].tabs[tab_key] | default: page.title %} +

+ {{ title }} +

+
+ {{ _content }} +
+ {% else %} + {{ _content }} + {% endif %} +
+
+ + + +
+
+ {% include_cached update-list.html lang=lang %} + {% include_cached trending-tags.html lang=lang %} +
+ + {% for _include in layout.panel_includes %} + {% assign _include_path = _include | append: '.html' %} + {% include {{ _include_path }} lang=lang %} + {% endfor %} +
+
+ + +{% if has_tail %} +
+
+ {% for _include in layout.tail_includes %} + {% assign _include_path = _include | append: '.html' %} + {% include {{ _include_path }} lang=lang %} + {% endfor %} +
+
+{% endif %} diff --git a/_layouts/post.html b/_layouts/post.html new file mode 100644 index 0000000..77822a6 --- /dev/null +++ b/_layouts/post.html @@ -0,0 +1,133 @@ +--- +layout: page +refactor: true +panel_includes: + - toc +tail_includes: + - related-posts + - post-nav + - comments +--- + +{% include lang.html %} + +

{{ page.title }}

+ + + +
+ {{ content }} +
+ +
+ + + {% if page.categories.size > 0 %} + + {% endif %} + + + {% if page.tags.size > 0 %} + + {% endif %} + +
+
+ + {% if site.data.locales[lang].copyright.license.template %} + + {% capture _replacement %} + + {{ site.data.locales[lang].copyright.license.name }} + + {% endcapture %} + + {{ site.data.locales[lang].copyright.license.template | replace: ':LICENSE_NAME', _replacement }} + + {% endif %} +
+ + {% include post-sharing.html lang=lang %} + +
+ +
diff --git a/_layouts/tag.html b/_layouts/tag.html new file mode 100644 index 0000000..3b90b8c --- /dev/null +++ b/_layouts/tag.html @@ -0,0 +1,23 @@ +--- +layout: page +# The layout for Tag page +--- + +{% include lang.html %} + +
+

+ + {{ page.title }} + {{ page.posts | size }} +

+
    + {% for post in page.posts %} +
  • + {{ post.title }} + + {% include datetime.html date=post.date wrap='span' class='text-muted small' lang=lang %} +
  • + {% endfor %} +
+
diff --git a/_layouts/tags.html b/_layouts/tags.html new file mode 100644 index 0000000..7800ca0 --- /dev/null +++ b/_layouts/tags.html @@ -0,0 +1,22 @@ +--- +layout: page +# All the Tags of posts. +--- + +
+ {% assign tags = '' | split: '' %} + {% for t in site.tags %} + {% assign tags = tags | push: t[0] %} + {% endfor %} + + {% assign sorted_tags = tags | sort_natural %} + + {% for t in sorted_tags %} + + {% endfor %} +
diff --git a/_plugins/posts-lastmod-hook.rb b/_plugins/posts-lastmod-hook.rb new file mode 100644 index 0000000..1fd6ecf --- /dev/null +++ b/_plugins/posts-lastmod-hook.rb @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby +# +# Check for changed posts + +Jekyll::Hooks.register :posts, :post_init do |post| + + commit_num = `git rev-list --count HEAD "#{ post.path }"` + + if commit_num.to_i > 1 + lastmod_date = `git log -1 --pretty="%ad" --date=iso "#{ post.path }"` + post.data['last_modified_at'] = lastmod_date + end + +end diff --git a/_posts/2023-07-18-spring.md b/_posts/2023-07-18-spring.md new file mode 100644 index 0000000..3625bb0 --- /dev/null +++ b/_posts/2023-07-18-spring.md @@ -0,0 +1,2052 @@ +--- +title: Spring +--- + +## Java and Maven Installation Steps (Ubuntu) + +- java 17 is needed for spring framework 6 / spring boot 3 +- download deb file from [here](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html) +- run `sudo apt install ./jdk-17_linux-x64_bin.deb` +- download binary tar.gz file from [here](https://maven.apache.org/download.cgi) +- run `tar xzvf apache-maven-3.9.3-bin.tar.gz` +- add the following to ~/.bashrc - + ```shell + export JAVA_HOME="/usr/lib/jvm/jdk-17" + export PATH="$PATH:$JAVA_HOME/bin/" + + export M2_HOME="~/apache-maven-3.9.3" + export MAVEN_OPTS="-Xms256m -Xmx512m" + export PATH="$PATH:$M2_HOME/bin/" + ``` +- note - when creating projects using start.spring.io, it comes bundled with the maven wrapper + +## Rest + +- evolution of http - http1 ➙ http1.1 ➙ http2 ➙ http3 +- tls is the newer standard and ssl is old (e.g. http3 only supports / uses tls) +- **safe methods** - only fetch information and do not cause changes. e.g. - GET, HEAD (like GET but requests for metadata), OPTIONS (supported http methods by the url), TRACE (echoes the request, helps understand if the request was altered by intermediate servers) +- **idempotent methods** - safe methods, PUT, DELETE (POST is not idempotent) +- status codes - 100 series for informational purpose, 200 series for success, 300 series for redirects, 400 series for client side errors and 500 series for server side errors +- rest - representational state transfer. it is stateless +- **richardson maturity model** - maturity of restful resources. this was probably needed because unlike soap, rest doesn't really have as many standards + - level 0 - swamp of pox - e.g. soap. pox here stands for plain old xml. typically uses just one url and one kind of method + - level 1 - resources - use multiple uris for identifying specific resources. e.g. /products/123 + - level 2 - use http verbs in conjunction with level 1. e.g. POST for creating a product + - level 3 - hateoas - hypermedia as the engine of application state. server returns links in the response to indicate what other actions are available. this helps with the idea of self discovery / self documenting of apis +- **marshalling** (pojo to json) / **unmarshalling** (json to pojo) is done with the help of jackson +- so far, finding [this pdf](https://docs.spring.io/spring-boot/docs/3.2.x/reference/pdf/spring-boot-reference.pdf) good for reference +- spring was introduced by rod johnson as a simpler **alternative to j2ee, thus replacing xml with pojos** +- spring boot is a wrapper around spring, which can do things like auto-configuration e.g. 
autoconfigure h2 if it is on the classpath, starter dependencies and so on +- **convention over configuration** - there are reasonable defaults, which we can override as needed +- spring boot **has an embedded tomcat server**, which can route requests to the application. earlier, the idea used to be to build war applications (we build jar applications now) and manually deploy them to tomcat servers. tomcat is also called the "servlet container" +- mvc - model view controller. a `DispatcherServlet` running underneath directs requests to / handles responses from the controller +- the controller calls a service, which has the business logic (interacting with db) and returns a model (pojo) +- servlet api is abstracted away from us, but that is what gets used underneath i.e. our requests are sent to servlets that can then forward these requests to our business logic +- the "servlet container" i.e. tomcat is responsible for **converting http requests / response to corresponding servlet request / servlet response** +- we can optionally add **filters** - these can **perform pre / post processing on our servlet requests / servlet responses** - e.g. spring security filters +- so entire flow according to my understanding -
+ ![webmvc architecture](/assets/img/spring/webmvc-architecture.drawio.png) +- `@Service` for service, `@Controller` for controllers +- extend the `CommandLineRunner` interface for initial bootstrapping +- by default in spring boot, package scan happens for any components that are in the same package or inside of any nested packages +- spring context creates components (i.e. instances) via this package scan and holds on to it + ```java + @SpringBootApplication + public class Spring6WebappApplication { + + public static void main(String[] args) { + ApplicationContext ctx = SpringApplication.run(Spring6WebappApplication.class, args); + BookController bookController = ctx.getBean(BookController.class); + } + } + ``` +- we can also autowire the `ApplicationContext` as well +- dependency injection - needed dependency is automatically injected for us. this can be achieved via (3 ways) - + - constructor (instantiation) + - setters + - using field injection i.e. `@Autowired` +- favoured method is using constructor injection with properties marked `private final`. this means the class cannot be instantiated (aka application fails) if the dependency is not available, instead of the dependency causing null pointer exceptions later +- dependency injection works with concrete classes / interfaces (think interface segregation in the i of solid principles) +- inversion of control (2 points) - + - it is the underlying framework that does the heavy lifting for us so that we can focus on the business logic. heavy lifting includes things like instantiation of objects + - allows dependencies to be injected at runtime. the dependencies are not predetermined +- primary beans - if we have two different concrete classes implementing an interface, and we try to use dependency injection for this interface, we get the error **expected single matching bean but found 2**. using `@Primary`, we can ask spring to prefer one of the implementations over another +- we can use `@Qualifier` to specify the bean name explicitly as well. useful when for e.g. we have multiple implementations as described above +- we can also "name" the parameters we want to use dependency injection for correctly. e.g. we have two concrete classes `EnglishGreetingService` and `SpanishGreetingService`. we can use the former using the correct name for the constructor arg + ```java + public Il8NController(GreetingService englishGreetingService) { + this.greetingService = englishGreetingService; + } + ``` +- by default, unless we name the bean, the name used for e.g. for `HelloService` would be `helloService`. we can name beans explicitly as well, e.g. `@Service("bonjourService")` +- profiles - we can annotate a bean with `@Profile` + ```java + @Service + @Profile("EN") + public EnglishHelloService implements GreetingService { } + ``` +- this means that the bean would only be instantiated when that particular profile is active. e.g. - + ```java + @SpringBootTest + @ActiveProfiles("EN") + class IL8NControllerTest { } + ``` +- a bean can be available in multiple profiles - `@Profile({ "EN", "English" })` +- we can also add a bean to be available by default - `@Profile({"EN", "default"})`. this means that if no bean is available, add this bean to the application context. e.g. 
in this case, use the `EnglishHelloService` implementation when any other bean for the `GreetingService` is not available +- so, we have discussed different techniques to resolve conflicts / to achieve inversion of control - `@Primary`, `@Service`, `@Qualifier`, naming the fields "correctly", `@Profile` (named and default), etc +- bean lifecycle methods - we can hook into the various lifecycle stages that a bean goes through, e.g. when the bean properties are set, when its instantiation is over and so on. we can either implement interfaces like `InitializingBean`, `DisposableBean` or annotations like `@PreDestroy` and `@PostConstruct` +- bean scopes - we can set scope via for e.g. `@Scope(BeanDefinition.SCOPE_PROTOTYPE)`. the different options are - + - **singleton** - it is the default scope of beans, one object per application context + - **prototype** - a new instance is returned every time it is referenced. so, the instance isn't stored in the container. this also means that once an instance is no longer used / referenced, it gets garbage collected + - **web scopes** - for web environments, the instance isn't stored in the container + - **session** - one instance per user per session + - **request** - one instance per http request + - **global session** - one instance per application lifecycle, like singleton +- three lifecycle phases - **initialization**, **use** and **destruction**. steps 1-7 below are for initialization +- note: steps 5 and 6 are done by us manually if we use `@Bean` inside `@Configuration` + 1. **application context is created** + 2. **bean factory is created** + 3. then, **bean definitions are loaded** into the bean factory from all different sources like component scan. the bean factory only contains metadata & references to the beans & has not instantiated them yet + 4. **bean factory post processors** act on the beans to configure them, e.g. fields annotated with `@Value` are set via `PropertySourcesPlaceholderConfigurer`. we can implement `BeanFactoryPostProcessor` if we want, the idea is to configure beans before they are instantiated + 5. **beans are instantiated**, and we do dependency injection using constructors. beans have to be instantiated in the correct order because of the dependency graph + 6. we use **setters** after initialization, e.g. we do dependency injection for setters. in general for good development practice, optional dependencies should use dependency injection via setters while required dependencies should use dependency injection via constructors + 7. **bean post processing** can happen, which is further broker down into 3 steps. note - this is **bean post processing**, step 4 was **bean factory post processing** + 1. pre-init bean post processor - implement `BeanPostProcessor` to call `postProcessBeforeInitialization` + 2. initializer - calls method annotated with `@PostConstruct` + 3. post-init bean post processor - implement `BeanPostProcessor` to call `postProcessAfterInitialization` + 8. **use phase** - application context maintains references to the beans with scope singleton, so they don't get garbage collected etc. we can look into the context anytime by implementing `ApplicationContextAware` and using `setApplicationContext` + 9. **destruction phase** - when close is called on application context. `@PreDestroy` method is called on beans before they are marked for garbage collection +- spring mvc - based on java servlet api, which is blocking. remember **servlet** (servlet container i.e. 
tomcat, dispatcher servlet, servlet request / servlet response, etc) +- spring webflux uses project reactor and not java servlet api, so it is non blocking +- similarly, `RestTemplate` is the older standard and is on the way to deprecation unlike `WebClient` +- spring works using **proxies** +- proxies wrap a class to add behavior, e.g. transaction proxies +- proxies help in adding behavior without modifying code +- proxies don't act on internal logic like calling private methods +- aspect oriented programming - helps in adding common behavior to many locations +- usually used for **cross cutting concerns** +- spring aop is easier to implement, does runtime weaving +- aspectj is a bit more difficult to implement, does compile time weaving, and has more features +- performance of compile time weaving > runtime weaving +- `JoinPoint` is the code +- `PointCut` is what selects a `JoinPoint` +- `Advice` is what gets applied to `JoinPoint`. three advices have been discussed here - `@Before`, `@AfterReturning` and `@Around` +- example - all methods annotated with `@AspectDebugger` should generate logs + - AspectDebugger.java - + ```java + @Target(ElementType.METHOD) + @Retention(RetentionPolicy.RUNTIME) + public @interface AspectDebugger { + } + ``` + - DebuggingAspect.java - + ```java + @Slf4j + public class DebuggingAspect { + + @Pointcut("@annotation(AspectDebugger)") + public void executeLogging() { + } + + @Before("executeLogging()") + public void logMethodCall(JoinPoint joinPoint) { + log.debug("started executing method: %s, with args: %s\n", + joinPoint.getSignature().getName(), Arrays.toString(joinPoint.getArgs())); + } + + @AfterReturning(value = "executeLogging()", returning = "retVal") + public void logMethodCall(JoinPoint joinPoint, Object retVal) { + log.debug("finished executing method: %s, with return value: %s\n", + joinPoint.getSignature().getName(), retVal); + } + + @Around("executeLogging()") + public Object trackExecutionTime(ProceedingJoinPoint joinPoint) throws Throwable { + Long startTime = System.currentTimeMillis(); + Object retVal = joinPoint.proceed(); + Long endTime = System.currentTimeMillis(); + log.debug("method: %s took: %dms to execute\n", + joinPoint.getSignature().getName(), endTime - startTime); + return retVal; + } + } + ``` +- lombok - code generation at compile time +- enable "annotation processing" in intellij for it to work with lombok +- `@Data` - shortcut for `@Getter`, `@Setter`, `@EqualsAndHashCode`, `@ToString`, `@RequiredArgsConstructor` +- `@NonNull` - throw an exception if null value is passed for field +- `@Value` - immutable variant (i.e. `private final`) of `@Data` +- `@SneakyThrows` - throw checked exceptions without declaring it in the throws clause +- `@Synchronized` - better version of `synchronized` +- `@Log` for java util logger. this is not usually recommended +- `@Slf4j` for slf4j logger. slf4j is actually a generic logging facade which uses logback bts in spring +- we can see the generated implementation inside the target folder (intellij has a decompiler that can parse this .class file for us) +- delombok - with the help of lombok plugin in intellij, we can generate the code for an annotation. this provides us with a starting point +- get list can be done by annotating controller method with `@RequestMapping("/api/v1/beer")` +- get by id - make use of path variable + ```java + @RequestMapping("/api/v1/beer") + public class BeerController { + // ... 
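    // a possible "get list" handler for the class-level mapping mentioned in the previous bullet -
    // a minimal sketch only: the beerService dependency and its listBeers() method are assumed
    // names not shown in the original snippet; with no path on the method-level annotation it
    // maps to GET /api/v1/beer
    @RequestMapping(method = RequestMethod.GET)
    public List<Beer> listBeers() {
        return beerService.listBeers();
    }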
+ @RequestMapping(value = "/{beerId}", method = RequestMethod.GET) + public Beer getBeerById(@PathVariable UUID beerId) { + // ... + ``` +- spring-boot-dev-tools - live reload +- using request body for e.g. for create requests. also, it is a good practice to add the location header, which specifies the id of the newly created object - + ```java + @PostMapping + public ResponseEntity saveBeer(@RequestBody Beer beer) { + Beer savedBeer = beerService.saveBeer(beer); + HttpHeaders headers = new HttpHeaders(); + headers.add(HttpHeaders.LOCATION, "/api/v1/beer/" + savedBeer.getId()); + return new ResponseEntity(headers, HttpStatus.CREATED); + } + ``` +- unit test - test specific sections of code, called code coverage. should execute very fast and in unity i.e. not have external dependencies +- integration test - include the spring context, database and message brokers +- functional test - these tests run against a running instance of the service +- testing pyramid - large number of unit tests, fewer integration and even fewer functional tests +- mock mvc - helps us unit test our controllers +- `@WebMvcTest` - create test splices so that the entire context is not brought up. only the controllers specified are instantiated and not even their dependencies. if we do not specify the controller explicitly, all controllers are instantiated +- we mock the dependencies of the controller using mockito +- **mocks** - predefined answers to the method calls. can assert on executions, e.g. assert it was called with a specific parameter +- **spy** - wrapper around the actual object +- the assertion of execution can be done using `verify` +- **argument matchers** - match the arguments of the execution of mocks. e.g. disallow the predefined response if the matching fails +- **argument captors** - capture the arguments of the execution of mocks +- apart from stubbing response, we can also perform assertions on executions of mocks - + ```java + verify(beerService).updateBeerById(eq(beer.getId()), any(Beer.class)); + ``` +- we can use `ArgumentCaptor` from mockito to help us capture arguments passed to mocks + ```java + ArgumentCaptor id_ = ArgumentCaptor.forClass(UUID.class); + verify(beerService).deleteBeerById(id_.capture()); + assertEquals(id, id_.getValue()); + ``` +- use `@MockBean` for injecting the service mocks into the controller +- we use `jsonpath`, which comes from [jayway jsonpath](https://github.com/json-path/JsonPath) +- we use hamcrest matchers e.g. 
notice the use of `is` + ```java + @WebMvcTest(controllers = {BeerController.class}) + class BeerControllerTest { + + @Autowired + MockMvc mockMvc; + + @MockBean + BeerService beerService; + + @Test + void getBeerById() throws Exception { + Beer beer = Beer.builder().id(UUID.randomUUID()).build(); + when(beerService.getBeerById(beer.getId())).thenReturn(beer); + + mockMvc.perform(get("/api/v1/beer/" + beer.getId()) + .accept(MediaType.APPLICATION_JSON)) + .andExpect(status().isOk()) + .andExpect(content().contentType(MediaType.APPLICATION_JSON)) + .andExpect(jsonPath("$.id", is(beer.getId().toString()))); + } + } + ``` +- using json path capabilities in assertions - + ```java + .andExpect(jsonPath("$.length()", is(2))) + .andExpect(jsonPath("$[?(@.id == '%s')]", one.getId().toString()).exists()) + .andExpect(jsonPath("$[?(@.id == '%s')]", two.getId().toString()).exists()); + ``` +- spring boot does configure an object mapper for us by default which we should prefer using in our test by autowiring instead of creating a new one so that our tests are closer to the real word scenario. we use this object mapper for creating request body in post requests +- if the request body contains json, we need to provide the content type header as well + ```java + mockMvc.perform(post("/api/v1/beer") + .accept(MediaType.APPLICATION_JSON) + .contentType(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(req))) + .andExpect(status().isCreated()) + .andExpect(header().exists("Location")) + .andExpect(header().string("Location", "/api/v1/beer/" + beer.getId())); + ``` +- when testing using mock mvc, `delete("/api/v1/beer/" + id.toString())` can be written as `delete("/api/v1/beer/{beerId}", id.toString())` to make use of positional binding +- we can also auto-configure mock mvc in a non-`@WebMvcTest` (such as `@SpringBootTest`) by annotating it with `@AutoConfigureMockMvc` +- the default error handling mechanism uses `DefaultHandlerExceptionResolver`, `ResponseStatusExceptionResolver` (maybe more?), which extends `AbstractHandlerExceptionResolver` +- we can annotate the methods inside controllers with `@ExceptionHandler` to handle specific exceptions i.e. we provide the annotation the exception it should handle. we can use this in the methods of controllers. the downside of this is that it is scoped to a single controller +- so, we can annotate a class with `@ControllerAdvice` to handle exceptions globally and continue to use `@ExceptionHandler` on the methods of this class + ```java + public class NotFoundException extends RuntimeException {} + + @ControllerAdvice + public class ErrorHandler { + + @ExceptionHandler(NotFoundException.class) + public ResponseEntity handleMethodNotFound() { + return ResponseEntity.notFound().build(); + } + } + ``` +- `@ResponseStatus` - we can annotate "custom exceptions" with this annotation to use a specific status for that exception. understand we cannot change code of existing pre-built exceptions, so this only works for custom exceptions. this way, we can skip the controller advice shown above + ```java + @ResponseStatus(HttpStatus.NOT_FOUND) + public class NotFoundException extends RuntimeException { + } + ``` +- to prevent having too many custom exceptions / no point of having custom exceptions that are only used once, we can use `ResponseStatusException`. 
it allows us to throw exceptions with a response status + ```java + catch (Exception e) { + throw new ResponseStatusException(HttpStatus.NOT_FOUND, "Foo", e); + } + ``` +- spring boot's `ErrorController` defines how to handle errors, e.g. respond with whitelabel pages in browsers vs json for rest requests. we can configure it using the following properties - + ```properties + # whether to include errors attribute - think this includes validation errors? + server.error.include-binding-errors=never + # whether to include exception attribute + server.error.include-exception=false + # whether to include message attribute - think this is for exception message? + server.error.include-message=never + # whether to include stack trace + server.error.include-stacktrace=never + # whether to display error page in browsers + server.error.whitelabel.enabled=true + ``` +- i observed that by setting the `server.error` properties to as verbose as possible, the errors property in the response was pretty decent (i.e. include the error message, field name, etc) +- however, when testing via mock mvc, something like this was not working - + ```java + .andExpect(jsonPath("$.errors.length()", is(2))) + .andExpect(jsonPath("$.errors[?(@.defaultMessage == '%s')]", "must not be blank").exists()) + .andExpect(jsonPath("$.errors[?(@.defaultMessage == '%s')]", "must not be null").exists()) + ``` +- i think this is more to do with how mock mvc isn't actually like a full blown integration test. so, to test the validation handling via mock mvc, i did the below - + ```java + MvcResult result = mockMvc.perform(post("/api/v1/beer") + .accept(MediaType.APPLICATION_JSON) + .contentType(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(beer))) + .andExpect(status().isBadRequest()) + .andReturn(); + + MethodArgumentNotValidException e = (MethodArgumentNotValidException) result.getResolvedException(); + assertNotNull(e); + List<String> defaultMessages = e.getBindingResult().getFieldErrors("beerName").stream() + .map(DefaultMessageSourceResolvable::getDefaultMessage) + .toList(); + assertEquals(2, defaultMessages.size()); + assertTrue(defaultMessages.contains("must not be null")); + assertTrue(defaultMessages.contains("must not be blank")); + ``` +- error handling - already discussed earlier - if the exception thrown is annotated with `@ResponseStatus`, it can be handled by `ResponseStatusExceptionResolver`. however, if it's not, spring will wrap it in a `ServletException`. this is not something mock mvc can handle. so basically, below will not work in such cases - + ```java + MvcResult result = mockMvc.perform(put("/api/v1/beer/{beerId}", beerDto.getId()) + // ... + .andReturn(); + result.getResolvedException() + ``` +- unit testing spring services example - + ```java + @ContextConfiguration(classes = {BeerCSVServiceImpl.class}) + @ExtendWith(SpringExtension.class) + class BeerCSVServiceTest { + + @Autowired + BeerCSVService beerCSVService; + + // ... + ``` +- now, we can use `@MockBean` etc. note how we configure `BeerCSVServiceImpl` but autowire `BeerCSVService` +- rest template - spring boot automatically configures a `RestTemplateBuilder` with sensible defaults for us, which we can autowire +- use uri component builder - as we add things like query parameters, we don't have to worry about things like encoding special characters etc, unlike when we directly provide the string url by performing concatenations ourselves +- here we expect the server to return an object of type spring data's Page, and so, we want to deserialize the response into this.
now Page is an interface, so we can instead use PageImpl. jackson cannot directly convert to PageImpl (i think this happens because PageImpl does not have the right constructor etc) so we use our own wrapper like below based on ([this](https://stackoverflow.com/a/77316854/11885333)) - + ```java + @JsonIgnoreProperties("pageable") // ignore the pageable property in the response + public class JacksonPage extends PageImpl { + + public JacksonPage(List content, int number, int size, long totalElements) { + super(content, PageRequest.of(number, size), totalElements); + } + } + ``` +- rest template code - note `UriComponentsBuilder`, `ParameterizedTypeReference` + ```java + @Service + @Slf4j + public class BeerClientServiceImpl implements BeerClientService { + + @Override + public Page listBeers(String beerName) { + + UriComponentsBuilder uriComponentsBuilder = UriComponentsBuilder.fromPath("/v1/beer"); + if (beerName != null) uriComponentsBuilder.queryParam("beerName", beerName); + + return restTemplate.exchange( + uriComponentsBuilder.toUriString(), + HttpMethod.GET, + null, + new ParameterizedTypeReference>() { + } + ) + .getBody(); + } + + @Override + public BeerDto getBeerById(UUID beerId) { + + UriComponents uriComponents = UriComponentsBuilder.fromPath("/v1/beer/{beerId}") + .buildAndExpand(beerId); + + return restTemplate.exchange( + uriComponents.toUriString(), + HttpMethod.GET, + null, + new ParameterizedTypeReference() { + } + ) + .getBody(); + } + } + ``` +- note - if we don't really have the need for mapping to a full blown pojo, we can use Map or better JsonNode. JsonNode has methods to parse json and extract different attributes from it etc + ```java + List beerNames = new ArrayList<>(); + response.getBody() + .get("content") + .elements() + .forEachRemaining(beerNode -> beerNames.add(beerNode.get("beerName").asText())); + log.info("response body = [{}]", beerNames); + ``` +- creating a beer - note `HttpEntity` + ```java + @Override + public BeerDto createBeer(BeerDto beerDto) { + ResponseEntity response = restTemplate.exchange( + "/api/v1/beer", + HttpMethod.POST, + new HttpEntity<>(beerDto), + Void.class + ); + URI location = response.getHeaders().getLocation(); + return getBeer(location.getPath()); + } + ``` +- there is a way to unit test rest template using `@RestClientTest`. i am not a fan of so many annotations, so i prefer `@SpringBootTest`, unless i want to do unit testing of services, where i can use `@ExtendWith(SpringExtension.class)`. 
[my full so answer](https://stackoverflow.com/a/77339935/11885333) + ```java + @Slf4j + @SpringBootTest + class BeerClientServiceImplTest { + + @Autowired + BeerClientService beerClientService; + + @Autowired + ObjectMapper objectMapper; + + @Autowired + RestTemplate beerServiceRt; + + MockRestServiceServer mockServer; + + @BeforeEach + void setUp() { + mockServer = MockRestServiceServer.createServer(beerServiceRt); + } + + @Test + @SneakyThrows + void listBeers() { + Page stubbedResponse = new PageImpl<>( + List.of(BeerDtoMocks.two), PageRequest.of(1, 1), 1 + ); + mockServer.expect(method(HttpMethod.GET)) + .andExpect(requestTo(containsString("/api/v1/beer"))) + .andRespond(withSuccess() + .body(objectMapper.writeValueAsString(stubbedResponse)) + .contentType(MediaType.APPLICATION_JSON)); + + Page response = beerClientService.listBeers(null); + assertEquals(BeerDtoMocks.two.getBeerName(), response.getContent().get(0).getBeerName()); + } + } + ``` +- similarly, to mock post calls (we need to return id in location header) - + ```java + UUID id = UUID.randomUUID(); + URI location = UriComponentsBuilder.fromPath("/api/v1/beer/{beerId}") + .buildAndExpand(id) + .toUri(); + mockServer.expect(method(HttpMethod.POST)) + .andExpect(requestTo(containsString("/api/v1/beer"))) + .andRespond(withAccepted().location(location)); + ``` +- spring 6 introduced [`RestClient`](https://spring.io/blog/2023/07/13/new-in-spring-6-1-restclient/) as an alternative to `RestTemplate`, with fluent api like `WebClient` +- actuator helps us in monitoring and managing our applications through http endpoints +- we can see all available endpoints [here](https://docs.spring.io/spring-boot/docs/current/reference/html/actuator.html#actuator.endpoints) +- adding actuator in spring boot + ```xml + + org.springframework.boot + spring-boot-starter-actuator + + ``` +- by default, all endpoints are enabled but not exposed, only the health endpoint is exposed. to expose all endpoints, use `management.endpoints.web.exposure.include=*` +- we can see the health at /actuator/health +- it would return `{ status: "UP" }` if it works fine +- this endpoint can for e.g. be useful for configuring readiness probe of spring boot applications deployed on kubernetes +- add property `management.endpoint.health.show-details=ALWAYS`, [docs](https://docs.spring.io/spring-boot/docs/current/reference/html/actuator.html#actuator.endpoints.health) to show more details +- we can also add custom health checks to show up when we hit the health endpoint (not discussed) +- we can see arbitrary information about the app at /actuator/info +- inside pom.xml inside `spring-boot-maven-plugin`, add below - + ```xml + + + + build-info + + + + ``` +- this gives build time, version, maven coordinates of the project, etc +- it generates a file at target/classes/META-INF/build-info.properties +- add the plugin below - + ```xml + + pl.project13.maven + git-commit-id-plugin + + ``` +- to enable all git related information like branches, last commit, etc., [add below](https://docs.spring.io/spring-boot/docs/2.6.6/reference/html/actuator.html#actuator.endpoints.info.git-commit-information) + ```properties + management.info.git.mode=full + ``` +- it generates a file at target/classes/git.properties +- we can add custom endpoints to actuator as well (not discussed) +- we can secure the health endpoints using spring security! - e.g. 
allow all users to access the health endpoint and only users with a role of admin to access other endpoints + ```java + @Configuration + public class SecurityConfig extends WebSecurityConfigurerAdapter { + + @Override + protected void configure(HttpSecurity http) throws Exception { + http.authorizeRequests() + .requestMatchers(EndpointRequest.to(HealthEndpoint.class)).permitAll() + .requestMatchers(EndpointRequest.toAnyEndpoint()).hasRole("ADMIN"); + + http.csrf().and().httpBasic(); + } + } + ``` +- metrics - can integrate with many other monitoring systems like cloudwatch, datadog, prometheus, etc. by using micrometer which is vendor neutral, just like slf4j for logging +- it would return information like jvm memory usage, system cpu usage, etc +- hitting `/actuator/metrics/` will show what all endpoints we can hit, then we can hit them via for instance `/actuator/metrics/application.ready.time` +- opencsv - convert csv records to pojo. define pojo as such - + ```java + @Data + @AllArgsConstructor + @NoArgsConstructor + @Builder + public class BeerCSVRecordDto { + + @CsvBindByName + private Integer row; + + @CsvBindByName(column = "count.x") // specify column name explicitly + private Integer countX; + } + ``` +- now, use the code below - + ```java + File file = ResourceUtils.getFile("classpath:data/beers.csv"); + List records = new CsvToBeanBuilder(new FileReader(file)) + .withType(BeerCSVRecordDto.class) + .build() + .parse(); + ``` +- note - `ResourceUtils` comes from spring, can be used for reading files in classpath easily + +## JPA + +- ddl - data definition language - creating / dropping tables, indices, etc +- dml - data manipulation language - insert, update and delete data +- dql - data query language - retrieving data, joins, aggregations, etc +- dcl - data control language - grant / revoke access +- at its core, jdbc (java database connectivity) is used to interact with sql databases +- jdbc is used to prepare sql statements, bind arguments, scroll through results, etc +- clearly, this is low level api and therefore tedious to work with +- idea is to work with java objects instead of `java.sql.ResultSet` +- **object / relational paradigm mismatch** / **impedance mismatch** - object models and relational models do not work well together out of the box. some examples are - + - granularity - e.g. let us say user has an address (one to one). in java, there would be a separate address class to represent this, and the user class will contain a reference to the address class. in sql, the same user table might have multiple columns for address like state, city, zip code, etc + - inheritance - e.g. we have multiple billing details, credit card and bank account. in java, there would be separate classes representing credit card and bank account, both extending a common super class billing details. sql doesn't support inheritance like this + - identity - == in java is for instance identity. equals in java is for instance equality, where all fields can be compared. equality of two rows in database is done by database identity i.e. comparing only the primary key. all three things are different + - association - in java, we can represent them using object references, e.g. for one to many, the one side would have a list as an object reference, while the many side will only have a single object reference. 
in sql however, we just have a foreign key constraint +- hibernate is basically an orm (object relational mapper) +- so, this helps with interoperability between java objects and underlying rdbms using metadata +- jpa - jakarta persistence api is a specification. hibernate implements jpa +- other hibernate components - + - hibernate validator - implementation of bean validation (jsr 303) + - hibernate envers - audit trail of data + - hibernate search - uses apache lucene underneath to add text search capabilities + - hibernate ogm (object grid mapper) - reusing hibernate for no sql databases including key value, graph, document, etc + - hibernate reactive - non blocking way of interacting with the database + - hibernate jpamodelgen - static metamodel (discussed later) +- spring data commons - helps unify access to different kinds of data stores, be it relational or no sql, and makes code even more concise +- spring data jpa is a jpa specific implementation of spring data, adding functionality like generating implementations based on interface method names +- other spring data components - + - spring data jdbc - sits on top of spring data. so, it eliminates the magic that spring data jpa might have, but at the same time eliminates boilerplate unlike when interacting with jdbc directly + - spring data rest - exposing spring data repositories as rest resources + - spring data mongodb - for mongodb (document database) + - spring data redis - for redis (key value database) + - spring data neo4j - for neo4j (graph database) +- simple class example with id - + ```java + @Entity + @Data + @AllArgsConstructor + @NoArgsConstructor + public class Message { + + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + private String text; + } + ``` +- `EntityManagerFactory` / `EntityManager` are jpa, while `SessionFactory` / `Session` are specific to hibernate, so i assume we should always try using the former. note the syntax below of starting and committing a transaction + ```java + @Test + public void loadFromStorage() throws Exception { + List messages; + + try (EntityManagerFactory emf = Persistence.createEntityManagerFactory("jpa-one")) { + try (EntityManager em = emf.createEntityManager()) { + em.getTransaction().begin(); + Message message = Message.builder().text("hello world!").build(); + em.persist(message); + em.getTransaction().commit(); + + em.getTransaction().begin(); + messages = em.createQuery("select m from Message m", Message.class).getResultList(); + messages.get(0).setText("updated hello!"); + em.getTransaction().commit(); + } + } + + assertAll( + () -> assertEquals(1, messages.size()), + () -> assertEquals("updated hello!", messages.get(0).getText()) + ); + } + ``` +- using spring data jpa, this is even simpler - + ```java + @Test + public void loadFromStorage() { + Message message = Message.builder().build(); + message.setText("hello spring data jpa!"); + messageDao.save(message); + + Iterable messages = messageDao.findAll(); + assertEquals("hello spring data jpa!", messages.iterator().next().getText()); + } + ``` +- note - performance of spring data is considerably slower than regular hibernate when dealing with very huge amounts of data +- for the most part, we should use / be able to use jpa annotations, coming from jakarta.persistence. we should have to use ones coming from hibernate for specific use cases only +- we can have global annotations which do not need to be put into a specific file, like `@NamedQuery`. 
we can keep global metadata inside a file package-info.java +- for rapid prototyping, we can set `spring.jpa.hibernate.ddl-auto=update` but for production, prefer using `validate` instead +- to log the sql statements, use `spring.jpa.show-sql=true` or `logging.level.org.hibernate.SQL=DEBUG` (the latter will use the logger i.e. have the package name etc. as a prefix, which helps maintain the standard log format). for debugging purposes, we can log the bound values as well using `logging.level.org.hibernate.orm.jdbc.bind=TRACE` - without this property set to trace, logs will show the sql but not the actual values in statements like insert +- hikari - maintains a connection pool to the database. establishing a connection to the database is a complex / resource intensive operation +- database migration - runs prior to or in conjunction with the application. helps track history, successful vs unsuccessful scripts etc. and thus avoids data loss +- two popular solutions - liquibase (more complex and robust) and flyway +- both have integrations with spring boot (preferred since automated?), maven / gradle plugins and have clis as well +- flyway commands - + - migrate - migrate to latest version + - clean - drop all database objects (NOT FOR PRODUCTION) + - info - print information about migrations + - validate - validate available migrations against applied migrations + - undo - undo the most recently applied migration + - baseline - baseline an existing database i.e. we start using flyway from an intermediary state and not from the get go + - repair - repair the schema history tables maintained by flyway +- add the flyway dependency for mysql (version comes from spring boot starter parent) + ```xml + <dependency> + <groupId>org.flywaydb</groupId> + <artifactId>flyway-mysql</artifactId> + </dependency> + ``` +- files should be inside of resources/db/migration and have the format `V1__init-beer.sql` +- note - if encountering too many problems with h2 vs mysql (e.g. i encountered one with uuid described above), we can use vendor-specific folders like db/migration/{vendor} - is it better to just use test containers instead? +- flyway automatically creates the `flyway_schema_history` table for us the first time around and adds these scripts to it as rows + + | installed_rank | version | description | type | script | checksum | installed_by | installed_on | execution_time | success | + | -------------- | ------- | ----------- | ---- | ------------------- | ---------- | ------------ | -------------------------- | -------------- | ------- | + | 1 | 1 | init-beer | SQL | V1\_\_init-beer.sql | -978541020 | SA | 2023-07-22 20:38:03.365998 | 4 | TRUE | + +- my doubt - hopefully, there is some "serious" locking / transaction level that flyway uses. e.g. what if i have horizontally scaled instances - i would not want there to be any consistency issues +- validation - defensive programming +- e.g. do not allow null / white spaces for name - + ```java + @NotNull + @NotBlank + private String beerName; + ``` + and add `@Valid` to the method arguments like so + ```java + public ResponseEntity saveBeer(@Valid @RequestBody BeerDto beer) { + ``` +- we can also apply hibernate validations on our entities (which i don't think is a good practice) and the database type constraints themselves (e.g. column length limits) act as a validation layer as well +- accessing metadata at runtime - we can access the **metadata of our models** at runtime. two options - + - **dynamic metamodel** - using jakarta we get the `EntityManagerFactory` - remember only this - `emf.getMetamodel()`.
notice how we get access to the entity and its attributes - + ```java + Metamodel metamodel = emf.getMetamodel(); + Set> managedTypes = metamodel.getManagedTypes(); + ManagedType itemType = managedTypes.iterator().next(); + SingularAttribute idAttribute = itemType.getSingularAttribute("id"); + ``` + - **static metamodel** - hibernate to jpa metamodel generator, using `hibernate-jpamodelgen` dependency. use case - type safe query builder - + ```java + CriteriaBuilder cb = em.getCriteriaBuilder(); + CriteriaQuery query = cb.createQuery(Item.class); + Root fromItem = query.from(Item.class); + Path namePath = fromItem.get(Item_.name); + query.where(cb.like(namePath, cb.parameter(String.class, "pattern"))); + + List items = em.createQuery(query) + .setParameter("pattern", "%Item 1%") + .getResultList(); + ``` +- note - with spring 6, the javax persistence namespace has been renamed to jakarta +- all annotations like `@Id`, `@GeneratedValue`, `@Entity`, etc. come from jakarta.persistence now +- beauty of `CrudRepository` - we can change spring-data-jpa to spring-data-mongodb, without any changes required inside code. this is because it comes from spring-data-commons i believe +- `JpaRepository` extends both `CrudRepository` and `PagingAndSortingRepository` for us, so people usually use this variant +- jpa can generate implementations based on interface method names. some things it supports includes `Like`, `IgnoreCase`, `OrderBy` (with `Asc` / `Desc`), `Distinct`, `LessThan`, `First` / `Top` +- we can return `List`, `Optional`, etc +- the syntax correctness of these methods are verified when the application context loads up +- `@Query` - the method name in this case can be anything +- we can bind parameters by position or by name, and use `@Param` if we bind using name +- we can add the `nativeQuery` to write native sql, but we loose out on portability (swap underlying relational database easily, e.g. integration test vs production) +- `@Query` issue - while this does give more flexibility around writing complex jpql, the correctness of the query is not verified like interface methods i.e. the query will only fail execution when called. maybe because unlike here, jpa has to generate the corresponding concrete implementation in case of interface methods? +- **projections** - spring data jpa can also help change shape of return type instead of using the persistent class as the return type. e.g. we want to fetch less data from database for optimization / exposing less fields to the service layer, etc +- we can use interface or classes for this custom projection +- interface projection - the underlying "proxy class" would be generated by jpa +- **interface projection** has two types - **close projections** and **open projections** +- **close projections** - names of interface methods match the names of the persistent class attributes + ```java + public interface EmployeeView { + + String getFirstName(); + + String getLastName(); + } + ``` +- **open projections** - when we want to do more complex things. 
notice how we use spel inside `@Value` + ```java + public interface EmployeeView { + + @Value("#{target.firstName} #{target.lastName}") + String getFullName(); + } + ``` +- issue - spring cannot optimize open projections since it does not know in advance what columns might be required, unlike in closed projections +- **class projection** - the names of the constructor arguments should match the field names of the persistent class exactly + ```java + @Data + public class EmployeeDto { + private String fullName; + + public EmployeeDto(String firstName, String lastName, String email) { + this.fullName = firstName + " " + lastName; + } + } + ``` +- issue - nesting of projections (e.g. one to many) is not supported by class based projections unlike interface based projections +- for insert, update, delete operations, we can continue using `@Query`, but we also need to add `@Modifying` on top of it +- the automatic generation of implementation based on method names is also supported for delete operations, e.g. `deleteByLevel` +- `deleteByLevel` vs `deleteBulkByLevel` - `deleteByLevel` will first run a query and then delete all objects one by one. this will also thus call "registered callbacks" if any. `deleteBulkByLevel` will run a single jpql query i.e. not load all the elements first, and skip all callbacks +- qbe - **query by example** - allows for dynamic query creation - something we cannot do using techniques like `@Query` / interface method names +- it has three parts - + - **probe** - we set the values used by `ExampleMatcher` in the persistent class + - **`ExampleMatcher`** - provides the rules for matching the properties + - **`Example`** - combines the `ExampleMatcher` and probe +- example of qbe. note - if we do not use `withIgnorePaths`, default values of the probe (e.g. 0 for primitive integer) would be put in the where clause of the sql / jpql for those properties + ```java + User user = new User(); + user.setEmail("@someotherdomain.com"); + + ExampleMatcher matcher = ExampleMatcher.matching() + .withIgnorePaths("level", "active") + .withMatcher("email", match -> match.endsWith()); + + List<User> users = userRepository.findAll(Example.of(user, matcher)); + ``` +- doubt - based on how we are manually setting properties inside for e.g. `withIgnorePaths`, is this a good use case for introducing hibernate-jpamodelgen? +- request param - note how we pass required as false, since it is true by default. use case - e.g. providing pagination related parameters + ```java + public List listBeers(@RequestParam(required = false) Integer pageNumber) { + ``` +- a neat trick - right click on a method -> refactor -> change signature. we can for e.g. add a new argument to the method, e.g. String beerName. we can also provide a default value, e.g. null. this means that the method and all its usages will be appropriately refactored, without us doing this manually in every place +- implementing paging and sorting - + - to repository methods, add an argument of PageRequest - constructed using page number, size, sort object + - repository methods return a Page - contains the content (list of objects), utility methods to go to next / previous page, etc +- implementation - + ```java + // repository + Page findAllByBeerStyle(BeerStyle beerStyle, PageRequest pageRequest); + + // service + PageRequest pageRequest = PageRequest.of( + pageNumber != null && pageNumber > 0 ? pageNumber - 1 : DEFAULT_PAGE_NUMBER, + pageSize != null && pageSize > 0 ? 
pageSize : DEFAULT_PAGE_SIZE, + Sort.by(Sort.Order.by("beerName"), Sort.Order.by("beerStyle")) + ); + + Page beers = beerRepository.findAllByBeerStyle(beerStyle, pageRequest); + return beers.map(beerMapper::map); // returns new Page by calling map on all elements of page + + // tests - for instance, create a Page object to stub return values + Page beers = new PageImpl<>(List.of(one, two)); + ``` +- **entity type** - they are the persistent classes we use. they have ids (key constraint, identity constraint) and foreign keys for referencing other entity types (referential constraint). they have their own lifecycle and exist independently of other entity types +- **value type** - they belong to another entity type and do not have their own lifecycle. they would not have an identity of their own. some examples of value types - + - address in user. can be represented as **embeddable classes** in jpa + - recall the idea of **weak identities** and **identifying relationships**. e.g. a bid is a weak identity and its **identifying relations** are item and user. so, value types can be represented as a table inside our database as well +- recall - instance identity != instance equality != database identity +- primary keys - should not be null (entity constraint), should be unique (key constraint) and should not be updatable (hibernate does not work well with updatable primary keys) +- due to the restrictions above, and the fact that databases do not "perform optimally" with all types when indexing, it is better to have **surrogate keys** over **natural keys** +- for taking help from jpa to generate surrogate keys, we use `@GeneratedValue` along with `@Id`. otherwise, we will have to take care of assigning identifiers ourselves + - `GenerationType.AUTO` - the default. jpa talks to the underlying database to decide which strategy is the best + - `GenerationType.IDENTITY` - auto incremented primary key column + - `GenerationType.SEQUENCE` - a database sequence is maintained separately, and it is called every time before an insert + - `GenerationType.TABLE` - an extra table called `HIBERNATE_SEQUENCES` is maintained, where there is one row for each entity. this table would be referred to before every insert +- sequence vs auto increment - why we should consider sequence - in case of auto increment, we need to wait for response from the database for ids. in case of sequence, hibernate is "aware" of the id. so, our instances would have an id assigned to them even if the actual insert inside the db has not happened yet (multiple inserts can be batched, which is when this might be useful) +- another option - uuid - for globally unique ids. advantage - is random and fairly unique across systems and databases. disadvantage - more space and is thus less efficient compared to the incremented ids + ```java + @Data + @Builder + @Entity + @AllArgsConstructor + @NoArgsConstructor + public class PersistentBeer { + + @Id + @GeneratedValue + @UuidGenerator // org.hibernate.annotations.UuidGenerator + @Column(columnDefinition = "binary(16)") + private UUID id; + + @Version + private Integer version; + + // ... + } + ``` +- note - had to add the `columnDefinition` because without it, h2 was failing when `ddl-auto` was set to `validate` but mysql works without this as well +- calling methods, like `repo.save(obj)` doesn't always guarantee obj will be updated by jpa, so always use `obj = repo.save(obj)` instead. 
remember how first level cache is used by jpa etc, so that is where these things probably become important +- override table name using `@Table`. by default, our camel cased classes are converted to snake case. note - sql is case insensitive +- we can also pass properties like schema etc to `@Table` +- `hibernate.auto_quote_keyword` - have hibernate automatically add quotes to reserved keywords which might be used as table / column names. remember that for spring boot, the prefix of `spring.jpa.properties` might come into picture, i.e. `spring.jpa.properties.hibernate.auto_quote_keyword=true` +- we can also use backticks / double quotes explicitly, e.g. `@Table("\"User\"")` +- if for e.g. we need a naming strategy, e.g. prefix all tables names with `CE_`. we can use naming strategy for this - + ```java + public class CENamingStrategy extends PhysicalNamingStrategyStandardImpl { + + @Override + public Identifier toPhysicalTableName(Identifier name, JdbcEnvironment context) { + return new Identifier("CE_" + name.getText(), name.isQuoted()); + } + } + + // ... + properties.put("hibernate.physical_naming_strategy", CENamingStrategy.class.getName()); + ``` +- **dynamic sql generation** - even when we update some columns, we see all columns being updated ie. previous column values itself are used. when using hibernate, when we load our application, hibernate generates crud statements for all our persistent classes and caches them. this way, it does not have to regenerate them entirely every time 🤯. this behavior can be disabled as well. use case - we only update one column, but our sql will try updating all columns by reusing the previous value, but this can become very slow if the table has a lot of columns +- some classes are never updated once created, e.g. bid. hibernate can avoid dirty checking for such classes, thus making it faster. for this, annotate the persistent class with `@Immutable` +- we can create views using `@Subselect` +- we can also have the regular repositories for these to use them - + ```java + @Entity + @Immutable + @Subselect( + value = "select i.ID as ITEMID, i.NAME as NAME, " + + "count(b.ID) as NUMBEROFBIDS " + + "from ITEM i left outer join BID b on i.ID = b.ITEM_ID " + + "group by i.ID, i.NAME" + ) + @Synchronize({ "ITEM", "BID" }) + public class ItemBidSummary { + + @Id + private Long itemId; + + private String name; + + private long numberOfBids; + } + ``` +- why we should mention table names inside `@Synchronize` - this way, hibernate knows to **flush the updates** for these views before running the query +- so, remember the three annotations along with `@Entity` for views - `@Immutable`, `@Subselect`, `@Synchronize` +- primitive java types, their corresponding wrapper types and most java datetime related types can be directly converted by hibernate to corresponding sql types +- otherwise, if the property extends java.io.Serializable, the property is stored in its serialized form. this can have many issues - + - serialization / deserialization is costly + - if the application is demised, the class is no longer available and therefore the data in the database can no longer be interpreted +- **transient** - some properties need not be persisted. e.g. we might want to store `initialPrice` but not `initialPriceAfterTax`. we can use either the java `transient` keyword, or `@Transient` +- checks can be done using multiple ways. just stay consistent - + - hibernate validator, e.g. `@NotNull`. can help us validate at presentation layer. 
also, if using hibernate for ddl generation, this annotation would be ignored + - jpa / hibernate annotations, e.g. `@Column(nullable = false)`. exception would be thrown by jpa before the insert / update statement is executed. also, if using hibernate for ddl generation, this annotation would be factored in + - advantage - exception is thrown by hibernate itself without hitting database, thus performant + - disadvantage - duplication of logic if similar constraints are present in ddl as well + - relying on database having `not null` defined for columns. in this case, a constraint violation exception would be thrown by the database + - disadvantage - we lose out on flexibility, since changing constraints requires ddl + - advantage - data integrity guarantees for consumers using this data directly +- annotate properties with `@Generated`, so that hibernate knows that these values are generated by the database, and that hibernate needs to make "an extra round trip" after inserting / updating these entities to fetch the new value, by calling a new select +- jpa / hibernate handle usual java to sql type mapping, e.g. Integer / int in java to integer in sql, long / Long in java to bigint in sql, etc +- the idea is while there are some defaults, we can provide more specific values, for e.g. precision and scale for numeric types, length of string for varchar types, etc. not only that, based on what length we specify, hibernate can also decide the corresponding type for mysql - longtext, mediumtext. similarly, for byte[], it can choose tinyblob, mediumblob and longblob +- my understanding - we can lazy load large data types by annotating using `@Basic(fetch = FetchType.LAZY)`! +- to adjust whether we want to save only date, only timestamp or both date and timestamp, we can use `@Temporal`. default is `@Temporal(TemporalType.TIMESTAMP)`, but we can use just `DATE` / `TIME` +- enums - by default, if we don't add the annotation `@Enumerated(EnumType.STRING)`, the ordinal position will be used. issue - if we introduce a new value, it might affect the position of the existing enum values, thus making our data go haywire +- **property access** - jpa can either access the properties directly via fields, or via getter and setter methods. good practice - let everything use fields. if we need the persistence layer to go via getters and setters, we can do it as follows - + ```java + @Access(AccessType.PROPERTY) // the other one is AccessType.FIELD + private String name; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name.startsWith("AUCTION: ") ? name : "AUCTION: " + name; + } + ``` +- my understanding - the above can also be achieved using `@ColumnTransformer`, in which case we deal with sql instead of java code +- derived properties - calculated at runtime using sql. these are calculated every time the item is "retrieved from the database". so, do consider values getting outdated. doubt - can `@Synchronize` discussed earlier help with this? also, obviously these properties would be ignored in insert and update statements + ```java + @Formula("(select avg(b.amount) from bid b where b.item_id = id)") + private BigDecimal averageBidAmount; + ``` +- **custom converter** - e.g. we want to support a special type for currencies in our object model, but this of course might not be supported by the relational database we use. 
so, we can use custom converters (remember `@Convert`, `AttributeConverter` and `@Converter`) - + ```java + // target, as seen by object model + class MonetaryAmount implements Serializable { + + private BigDecimal value; + + private Currency currency; + } + + // object model type to relation model type interconversion + @Converter + class MonetaryAmountConverter implements AttributeConverter { + + @Override + public String convertToDatabaseColumn(MonetaryAmount monetaryAmount) { + return monetaryAmount.toString(); + } + + @Override + public MonetaryAmount convertToEntityAttribute(String s) { + String[] split = s.split(" "); // 35.61 USD + return new MonetaryAmount( + new BigDecimal(split[0]), + Currency.getInstance(split[1]) + ); + } + } + + // declaring the attribute + @Convert(converter = MonetaryAmountConverter.class) + @Column(name = "price", length = 63) + private MonetaryAmount buyNowPrice; + ``` +- create and update timestamps - + ```java + @CreationTimestamp(source = SourceType.DB) + private LocalDateTime createdDate; + + @UpdateTimestamp(source = SourceType.DB) + private LocalDateTime updateDate; + ``` +- my understanding - the default is using jvm's time, which might be an issue, since for e.g. for a horizontally scaled application the clocks might not be synchronized. disadvantage here is every insert would then not be "buffered" and have to be flushed immediately, just like generation strategy of identity vs sequence? +- embeddable - recall two kinds of **association** - **composition** and **aggregation**. _embeddable means composition_ +- so, embeddable entities - + - do not have their own identity. primary key is owning entity's primary key + - when owning entity is deleted or saved, same operation is carried out on embeddable entity + - it does not have a lifecycle of its own +- e.g. user (owning) and address - + ```java + @Embeddable + public class Address { + + private String street; + } + + @Entity + public class User { + + @Id + @GeneratedValue + private Long id; + + private String username; + + // note - no annotation needed here + private Address homeAddress; + } + ``` +- different approaches for inheritance have been discussed now - +- **mapped superclass** - mapping all subclasses to different tables + ```java + @MappedSuperclass + public class BillingDetails { + + @Id + @GeneratedValue(strategy = GenerationType.AUTO) + private Long id; + + private String owner; + } + + @Entity + public class BankAccount extends BillingDetails { + + private String account; + + private String bankName; + } + + @Entity + public class CreditCard extends BillingDetails { + + private String number; + + private String exp; + } + ``` +- output -
+ ![mapped superclass](/assets/img/spring/mapped-superclass.png) +- optionally, we could have made `BillingDetails` abstract +- also, to override properties of superclass from the subclass, we can use `@AttributeOverride`, e.g. modify the column name `owner` to `cc_owner` for the credit card table - + ```java + @AttributeOverride( + name = "owner", + column = @Column(name = "cc_owner") + ) + ``` +- this logic around mapped superclass can be extended to repositories as well. note how we use 1. generics and 2. `@NoRepositoryBean`. then, we can have specific methods in subclass dao / generic methods in superclass dao + ```java + @NoRepositoryBean + public interface BillingDetailsDao extends JpaRepository { + + Optional findByOwner(String owner); + } + + public interface CreditCardDao extends BillingDetailsDao { + + Optional findByNumber(String number); + } + + public interface BankAccountDao extends BillingDetailsDao { + } + ``` +- tips with mapped superclass - + - problem - doesn't work with polymorphic associations - we cannot have other entities reference `BillingDetails` / `BillingDetails` cannot reference other entities. this is because `BillingDetails` itself is not a concrete table + - when to use - for top level classes, when further modifications / changes in future are unlikely +- we can instead use **table per class** +- minute changes to code + - add `@Entity` to `BillingDetails` + - replace `@MappedSuperclass` with `@Inheritance(strategy = InheritanceType.TABLE_PER_CLASS)` + ```java + @Entity + @Inheritance(strategy = InheritanceType.TABLE_PER_CLASS) + public abstract class BillingDetails { + // ... + ``` + - remove `@NoRepositoryBean` from `BillingDetailsDao` +- advantage of table per class - supports foreign key +- my understanding - internally, table per class can do a "union of the tables of the subclasses" when querying the superclass. this is not supported when using mapped superclass. e.g. a user has a list of messages - and a message can of type sms, email, etc. so, we can use table per class for message class, and this way, while we see different tables in the relational database for different subclasses, we can have associations to our message class +- what above means i think is that in jpql, we can write `select * from BillingDetails` in table per class, but not in mapped superclass +- remember to create the `BillingDetails` as an abstract class, otherwise a new table for `BillingDetails` was being created +- probably because of how things work, another feature - we can now have foreign keys for a generic `BillingDetails`, i could see a common sequence table - billing_details_seq for both bank_account and credit_card. so, important - does this mean that there can be foreign keys to `BillingDetails` i.e. abstract class when using table per class, but not when using mapped superclass? +- so, it feels like table per class could be desirable for actual polymorphism cases, while invalid when we are just trying to move properties like create and update timestamp, id, etc to a common class, in which case mapped superclass is better +- **single table** hierarchy - a single table is used for representing the superclass, which has all the columns from all the subclasses +- a column for discriminating is used (default is dtype) - this helps determine which subclass a row belongs to +- code - only change is strategy + ```java + @Entity + @Inheritance(strategy = InheritanceType.SINGLE_TABLE) + public abstract class BillingDetails { + // ... + ``` +- output -
+ ![single table](/assets/img/spring/single-table.png) +- advantages - reporting, gains in performance since no unions etc. are involved, straightforward schema evolution, etc +- disadvantages - weaker data integrity, e.g. cannot enforce not null for columns of subclasses at database level (we can use validation techniques however). there is also a denormalization involved here +- when using repositories of subclasses, hibernate will automatically add filtering logic - `where dtype = 'BankAccount'` for us bts +- we can of course use the base class in jpql (since the table is of the base class after all) +- **joined** - this strategy will have tables for all subclasses and superclasses +- so, there would be joins involved - the id column in the subclasses (e.g. bank_account below) is both a primary key and a foreign key reference to the superclass (e.g. billing_details below) +- hibernate knows how to perform the joins for us +- code - only change is strategy + ```java + @Entity + @Inheritance(strategy = InheritanceType.JOINED) + public abstract class BillingDetails { + // ... + ``` +- output -
+ ![joined](/assets/img/spring/joined.png) +- e.g. if i run `billingDetailsDao.findAll()`, the sql run is as below. note the left join and the `case when` clause which helps hibernate determine which subclass it might map to + ```sql + select + b1_0.id, + case + when b1_1.id is not null then 1 + when b1_2.id is not null then 2 + end, + -- other columns + from + billing_details b1_0 + left join bank_account b1_1 on b1_0.id = b1_1.id + left join credit_card b1_2 on b1_0.id = b1_2.id + ``` +- e.g. if i run `bankAccountDao.findAll()`, the sql run is as below. note the normal (inner?) join + ```sql + select + b1_0.id, + -- other columns + from + bank_account b1_0 + join billing_details b1_1 on b1_0.id = b1_1.id + ``` +- disadvantage - joins are involved, thus taking a performance hit +- imagine our legacy system has two tables - author and author_details. however, in our new domain models, we would like to see it as one class + ![secondary table](/assets/img/spring/secondary-table.png) +- we can map the above using `@SecondaryTable`. note how we mention the `PrimaryKeyJoinColumn`, because the default was otherwise id i.e. the same column name as that of author table + ```java + @Entity + @SecondaryTable( + name = "author_details", + pkJoinColumns = @PrimaryKeyJoinColumn(name = "author_id") + ) + @Data + @AllArgsConstructor + @NoArgsConstructor + public class Author { + + @Id + @GeneratedValue + private Long id; + + private String name; + + @Column(table = "author_details") + private Instant dob; + + @Column(table = "author_details") + private String countryOfOrigin; + } + ``` +- java collections framework works well with hibernate +- we can use `ElementCollection`. i think that the point is that the child entity is owned by the parent i.e. "composition". features like cascading of persistence, deletion, etc follow. the child object need not be marked with `@Entity` itself. i do not see any real upside of this over the usual `OneToMany` etc annotations by making the child as an `@Entity`, so skipping it for now. we get much more fine grained control this way +- considerations when writing implementations for associations - + - we should always (as a best practice and as a requirement by jpa) use interfaces like `java.util.Set` instead of concrete implementations + - hibernate has its own collection classes for associations like one to many, which helps it with **dirty checking**. so basically, our collection instances are wrapped with these hibernate collections to help with dirty checking etc + - we should consider initializing with an empty collection's concrete implementation to avoid null checks / null pointer exceptions for newly created entities + - when creating bidirectional links, we need to carry out two steps for linking both sides, so, we can also add convenience methods like so - + ```java + public void addBid(Bid bid) { + bids.add(bid); + bid.setItem(this); + } + ``` +- many to one - this is the simplest, directly maps to the foreign key column. default column name used by jpa below is `item_id`. also, notice how we override the fetch type, since the default is eager + ```java + @ManyToOne(fetch = FetchType.LAZY) + private Item item; + ``` +- we can override the foreign key column name using `@JoinColumn` +- we can make this bidirectional, by mapping the one to many side as well. 
`getBids` will automatically fetch all the bids for an item for us +- one to many - using the `mappedBy` column, we tell hibernate that "load using the foreign key already specified inside the `item` property of `Bid`". the default fetch type is lazy. + ```java + @OneToMany(mappedBy = "item") + private Set bids = new HashSet<>(); + ``` +- it is common to set the cascade option on the `OneToMany`. in this case, we would want to cascade persist and remove +- `orphanRemoval = true` (false by default) tells hibernate that a bid should be deleted if it is removed from an item's collection. understand how this is different from remove cascade - cascade only ensures calls to delete bids are made when we call delete item + ```java + @OneToMany( + mappedBy = "item", + cascade = {CascadeType.PERSIST, CascadeType.REMOVE}, + orphanRemoval = true + ) + private Set bids = new HashSet<>(); + ``` +- note - my understanding - another difference between using `ElementCollection` vs `OneToMany` is that when we do for e.g. collection.clear() in the prior, a single database statement is issued, while deletes happen one by one in the later. so is it safe to assume that relying on cascade when deleting huge chunks of data is not a feasible option, and we should use some custom jpql / try using `deleteBulk` variants? +- another option - when specifying foreign keys, some sql databases support the `on delete cascade` clause. this way, when an item is deleted, its bids are deleted automatically by the database itself. we can tell hibernate about this using - + ```java + @OneToMany( + mappedBy = "item", + cascade = {CascadeType.PERSIST, CascadeType.REMOVE}, + orphanRemoval = true + ) + @OnDelete(action = OnDeleteAction.CASCADE) + private Set bids = new HashSet<>(); + ``` +- as soon as i comment out the OnDelete line, i see a delete statement for each bid of an item, but with that, i only see one delete statement in the output. is my assumption wrong - i can get rid of the `CascadeType.REMOVE` line with `OnDelete`? +- **cascading state transitions** - entities are independent by default. however, we might want for e.g. bids to be persisted when an item is persisted, bids to be deleted when an item is deleted. for this, we already saw - `CascadeType.PERSIST`, `CascadeType.REMOVE`. along with that, we have `orphanRemoval` to delete a bid removed from `item#bids` and finally, remember our ddl can contain `on delete cascade` +- some lombok specific annotations worth adding to one to many - + ```java + @Builder.Default + @EqualsAndHashCode.Exclude + @ToString.Exclude + ``` +- it might be more feasible to use `@Embeddable` for one to one associations. use one to one when we need to track the entity lifecycle separately i.e. if there are **shared references**. meaning - if a user just has a billing address and a shipping address, address can be marked as an embeddable. lets say another entity shipment has an address as well. we might want a shipment and a user to maintain reference to the same address instance. in this case, OneToOne becomes more feasible +- sometimes, when having one to one mapping, people end up using the same primary key for both tables. in this case, we can use the `@PrimaryKeyJoinColumn` +- normally, we would map one to one mapping using a separate foreign key / surrogate key combination, which is when we can use `@OneToOne` +- lastly, if we would like to track one to one relationships via a separate table, we can use the `@JoinTable` annotation. 
some use cases i can think of + - the relation itself (and not one of the entities) has some attributes + - storing nulls for foreign keys can be troublesome sometimes. so, it might be better to store all possible relations if any in a separate table altogether +- for one to many side, when defining the field, our options are (recall how it is advisable to use java collections interface on lhs, and not concrete implementations) - + - sets (`Set`) - no duplicates, no order + - lists (`List`) - duplicates, order + - bags (`Collection`) - duplicates, no order +- so based on above, for performance, the best type to use is bags. both de duping and maintaining order are expensive operations for hibernate + ```java + private Collection bids = new ArrayList<>(); + ``` +- disadvantage - we cannot eager fetch two or more collections of bags simultaneously, because it results in a **cartesian product** (discussed later) +- again to customize column names etc, the many side of one to many relation can have the `@JoinColumn`, while the one side will have the `mappedBy` to indicate it is not the owning side of the relationship +- my understanding of list - probably, using `List` instead of `Collection` never makes sense, unless we want to use `@OrderColumn`. this annotation basically orders elements inside the list and maintains the index of the element in a separate column of the table via the column name specified in the `@OrderColumn` (note - of course, `@OrderColumn` would be present on the field having the `@OneToMany`). now, this results in a performance degradation - hibernate will all the time do the reordering when we insert an element to the list etc (e.g. inserting / deleting element not at the ends of the list can be an o(n) operation). so, we might be better off just treating order as a separate field using `@Column`, forget about `@OrderColumn`, and let the ui do the grunt work of sorting / maintaining this order. now, we can use `Collection` instead of `List`. however, if one must - + ```java + // ... + @OneToMany(mappedBy = "item") + @OrderColumn(name = "bid_rank") + private List bids = new ArrayList<>(); + + // ... + @ManyToOne + private Item item; + ``` +- output -
+ ![order column](/assets/img/spring/order-column.png) +- again, we can have a `@JoinTable` in case the one to many is optional / the relationship itself has some attributes, and moving them to the many side is logically incorrect + ```java + // ... + @OneToMany(mappedBy = "item") + @OrderColumn(name = "bid_rank") + private List bids = new ArrayList<>(); + + // ... + @ManyToOne + @JoinTable( + name = "item_bids", + joinColumns = {@JoinColumn(name = "bid_id")}, + inverseJoinColumns = {@JoinColumn(name = "item_id")} + ) + private Item item; + ``` +- output -
+ ![join table one to many](/assets/img/spring/join-table-one-to-many.png) +- many to many - one side can just have `mappedBy` for the `@ManyToMany`, the other side can define the `@JoinTable` + ```java + // ... + @ManyToMany + @JoinTable( + name = "item_categories", + joinColumns = {@JoinColumn(name = "item_id")}, + inverseJoinColumns = {@JoinColumn(name = "category_id")} + ) + private Collection categories = new ArrayList<>(); + + // ... + @ManyToMany(mappedBy = "categories") + private Collection items = new ArrayList<>(); + ``` +- output -
+ ![many to many](/assets/img/spring/many-to-many.png) +- cascading options of remove might not make sense for many to many +- using an intermediate table to track the join table using a separate entity altogether. we can use `@EmbeddedId` to track the composite key. jpa does not pass without setting insertable / updatable to false and specifying column name explicitly inside the `Id` class + ```java + @Entity + @Data + @AllArgsConstructor + @NoArgsConstructor + @Builder + public class ItemCategories { + + @EmbeddedId + private Id id; + + @ManyToOne + @JoinColumn(insertable = false, updatable = false) + private Item item; + + @ManyToOne + @JoinColumn(insertable = false, updatable = false) + private Category category; + + private String linkedBy; + + @Data + @AllArgsConstructor + @NoArgsConstructor + @Builder + private static class Id implements Serializable { + + @Column(name = "category_id") + private Long categoryId; + + @Column(name = "item_id") + private Long itemId; + } + } + + // ... + + @OneToMany(mappedBy = "item") + private Collection itemCategories = new ArrayList<>(); + + // ... + + @OneToMany(mappedBy = "category") + private Collection itemCategories = new ArrayList<>(); + ``` +- output of `show create table item_categories` -
+ ![many to many with entity](/assets/img/spring/many-to-many-with-entity.png) +- note - we do not have to touch the id column for the most part - we will just call `setItem` / `setCategory`, and let hibernate do the rest for us +- **entity states** - + - **transient** - when we create a new instance using the `new` operator, the instance is in transient state i.e. it would be lost when no longer referenced. a transient instance will become persistent in multiple ways - e.g. `EntityManager#persist` is called on it directly, or there is a cascading operation from another instance which references this transient instance, etc + - **persistent** - it has a representation in the database. it has a primary key / id set. an instance can become persistent in multiple ways - via `EntityManager#persist`, or it is fetched using a query directly, fetched due to for e.g. lazy loading, etc. persistent instances are always associated with a persistence context + - **removed** - an entity can be deleted from the database in multiple ways - via `EntityManager#remove`, removed via orphan removal, etc + - **detached** - e.g. we find an entity using `EntityManager#find`, and then close the persistence context. our application logic still has a handle to this instance. the instance is now in detached state. we can make modifications on this instance and call `merge` later using a new `EntityManager` i.e. a detached instance from one persistence context can be merged into another persistence context +- **persistence context** - a persistence context is created when we call `EntityManager em = emf.createEntityManager()`, and closed when we call `em.close()` +- when the persistence context is closed (`em.getTransaction().commit()`?), hibernate performs **dirty checking** to get the changes made by the application +- then, it performs a sync with the underlying database using the right dml. this sync process is called **flushing**. we can also call `em.flush()` manually when needed to achieve the same? +- e.g. hibernate can perform the flush before a query to ensure the updated data is reflected in the query +- the persistence context also represents a *unit of work* +- the persistence context also acts as a **first level of cache** - if an entity is queried "again" in a persistence context, the same instance is returned again instead of hitting the database again. this way, during the entire unit of work i.e. inside the persistence context, the entity seen is the same everywhere, and then after the end, the entity can be safely written to the database +- recall impedance mismatch - so, based on above, hibernate guarantees instance identity, therefore instance equality and database identity both automatically. to validate - will this be true then - `repo.findById(123) == repo.findAll().findFirst(where id = 123)` + - does this mean we do not have to override equals? we should, and that too using a **business key** (discussed later) +- persistence context is scoped to a thread +- my understanding, tying things together - when we call `EntityManager#persist`, the instance goes into persistent state. **during this, hibernate has to assign an identity to the instance**. now, if we use something like auto increment, hibernate has to actually perform the insert into the database immediately to get the id. if we instead use something like a sequence generator, hibernate already knows the id and can delay this execution till flushing! see the sketch below
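+- to tie the above together, a minimal sketch of the entity states and the first level of cache (an assumption here - an `Item` entity with a generated id and a `name` setter, plus the same `EntityManagerFactory` `emf` as in the earlier `Message` example; the names are only for illustration) - + ```java + EntityManager em = emf.createEntityManager(); + em.getTransaction().begin(); + + // transient - not yet known to the persistence context + Item item = new Item(); + item.setName("first item"); + + // persistent - an id is assigned; with a sequence generator the actual insert can still be delayed till flush + em.persist(item); + + // first level cache - the same managed instance is returned, no extra select is fired + Item same = em.find(Item.class, item.getId()); + assert same == item; + + // dirty checking + flushing happen here, then the context closes and item becomes detached + em.getTransaction().commit(); + em.close(); + + // modify the detached instance and merge it into a new persistence context + item.setName("updated item"); + EntityManager em2 = emf.createEntityManager(); + em2.getTransaction().begin(); + Item merged = em2.merge(item); + em2.getTransaction().commit(); + em2.close(); + ```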
+- by techniques like delaying flushing dml to the end, batching, etc, hibernate ensures that the database locks are acquired for a short duration (database locks are needed for write operations) +- **lazy** - further, when we for e.g. run `Item item = em.getReference(Item.class, itemId);`, hibernate does not immediately run the sql. the id of the item instance is initialized (since we provided it) but other properties are not. the item object is like a proxy, and the sql is not run until another property is accessed, e.g. `item.getName()` +- if for e.g. we try to access `item.getName()` after closing the persistence context, we will get a `LazyInitializationException` +- **refresh** - e.g. "someone else" makes changes to the database. we can cause hibernate to refetch our instance using `em.refresh(item)` +- one seemingly clever approach - override the equals method to use the database identifier for equality. disadvantages - + - multiple transient instances added to a set will coalesce into one, since all have their id set as null + - when we call save on transient instances in a set, since their id changes, their hash code changes, and therefore they break the collection +- solution - use a **business key** i.e. a combination of other attributes which make it unique +- therefore, **do not use the surrogate key for equals** - hibernate already uses it for its **first level of cache** as discussed earlier +- we can use the foreign entity association for equals and hash code - e.g. for the bid entity, the business key can be a combination of item and its amount. this might mean using the business key of the foreign entity association internally +- initial databases had **2 phase locking**, while modern databases have **mvcc** +- **mvcc** - **multi version concurrency control** - with this, the locking is reduced even further, so that - + - readers do not block writers + - writers do not block readers + - multiple writers can however still not access a record +- for this to work, **multiple versions** of the same record need to exist +- some common problems have been discussed now - +- **the lost update** problem - + - transaction one starts to add 10 to our balance + - so, transaction one reads the balance as 100 + - transaction two starts to add 20 to our balance + - so, transaction two also reads the balance as 100 + - transaction one commits 110 to the database + - transaction two commits 120 to the database + - so the final state is 120, which should have ideally been 130, i.e. 
the update of transaction one is lost +- **unrepeatable read** problem - + - transaction one tries finding current balance and reads 100 + - transaction two comes in, adds 10 and commits changes to database + - transaction one tries finding current balance again and reads 110 this time + - so, transaction one has read different values for the same row during its execution +- **phantom read** problem - + - transaction one tries generating a statement and finds 110 _transactions_ for the month of february + - transaction two comes in, adds 10 and commits changes to database + - transaction one tries generating a statement and finds 111 _transactions_ for the month of february + - so, transaction one has read different result sets for the same query during its execution + - my understanding - basically, it is like unrepeatable read, but instead of just the values, the amount of rows increase or decrease, so its due to insert or delete, unlike unrepeatable read which is due to update +- so, both jpa and sql have **isolation levels** (recall i of acid!). remember - as we increase isolation level, performance degrades. in multi user concurrent systems like today, we might have to sacrifice some amount of isolation for better performance and scalability. just remember the name, the definition will become obvious - + - **read uncommitted isolation** - all problems are allowed + - **read committed isolation** - dirty reads are not permitted + - **repeatable read isolation** - nothing is permitted except phantom reads + - **serializable isolation** - emulates serial execution i.e. transactions are executed one after another and not concurrently. none of the four problems are permitted. this relies on table locks and not just row level locks +- my understanding 😠 - despite what i wrote above, apparently, due to the change in industry standard from 2 phase locking to mvcc, at least in mysql, lost update is not prevented by an isolation level of repeatable read as well. it is prevented by serializable isolation level, which does not use mvcc at all, and uses 2 phase locking!! this is why, we should use `@Version` always, or at least that is what i understand from [this](https://stackoverflow.com/a/53564708/11885333) answer +- jpa uses the isolation level of database connection - most resolve to read committed, but mysql uses repeatable read +- however, recall how persistence context cache is used when we attempt to retrieve the same row twice. this means that while isolation level is read committed, we are effectively using repeatable read +- **optimistic concurrency control** - hibernate supports maintaining **version columns** for us automatically, using which ensures _first commit wins_ in case of parallel transactions. it is easy to use, so probably use it always +- note - use optimistic concurrency control only when it is acceptable to detect conflicts late in a unit of work. concurrent updates should not be a frequent scenario, otherwise a lot of cpu cycles would be wasted i.e. the computation would be performed and then the update would have to be rejected +- to enable versioning, we use `@Version` +- we should not have to set version manually, it should be handled by hibernate for us automatically - if hibernate feels that the entity has changed during dirty checking, it would automatically bump up the version number for us bts +- when updating, instead of the where clause having `where id = ?`, the where clause now has `where id = ? 
and version = ?` +- we can use int, short, long, and hibernate will wrap again from 0 if the version limit is reached +- `OptimisticLockException` is raised if version is changed by another concurrent transaction +- we might not like the extra version column. hibernate can use the timestamp fields like last modified by to help achieve optimistic locking + ```java + @Version + private LocalDateTime lastUpdatedAt; + ``` +- tip - due to jvms being possibly deployed on different operating systems, the time might not be guaranteed to be accurate / synchronized in all of them (clock skew). so, we can tell hibernate to ask the database for the timestamp. disadvantage - a database hit is required every time, just like when using auto incremented ids +- how i tested if optimistic locking is working in my application - try updating using same version twice - the second update should throw an exception. also note how i disable the transaction on the test method so that this test is executed "out of a transaction". finally, recall how exception would be wrapped by `ServletException` when using mock mvc + ```java + @Test + @SneakyThrows + @Transactional(propagation = Propagation.NOT_SUPPORTED) + void updateBeerByIdOptimisticLockingCheck() { + PersistentBeer beer = beerRepository.findAll().get(0); + BeerDto beerDto = beerMapper.map(beer); + + beerDto.setBeerName("updated beer name"); + mockMvc.perform(put("/api/v1/beer/{beerId}", beerDto.getId()) + .contentType(MediaType.APPLICATION_JSON) + .accept(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(beerDto))) + .andExpect(status().isNoContent()); + + beerDto.setBeerName("updated beer name again"); + ServletException e = assertThrows( + ServletException.class, + () -> mockMvc.perform(put("/api/v1/beer/{beerId}", beerDto.getId()) + .contentType(MediaType.APPLICATION_JSON) + .accept(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(beerDto))) + .andExpect(status().is5xxServerError()) + ); + assertTrue(e.getMessage().contains("ObjectOptimisticLockingFailureException")); + } + ``` +- **optimistic lock mode** - imagine item to category is many to one. we have many categories and items, and we would like to find the sum of prices for all items for each category. however, when we were iterating through categories, midway through, category for an item was changed, thus making us consider an item into two (or maybe no) categories +- basically, we have the unrepeatable read problem (category_id of item has been updated). note - recall how we discussed that hibernate has default of read committed, and with the help of hibernate persistence context cache, it kind of becomes repeatable read. so, why do we still have the problem? in our case, a result set is returned for every query. so, while hibernate persistence context cache would contain the older version of the item, it would load this item in the result set. yes, the older version of the item is loaded but it can still happen that multiple result sets contain an item / no result sets contain an item +- so, we can set lock mode = optimistic. this way, after performing all the operations (during commit), **for each item that we loaded**, hibernate would rerun a select and match the version column. 
if it has changed, it would throw the `OptimisticLockException` + ```java + EntityManager em = emf.createEntityManager(); + em.getTransaction().begin(); + + for (Long categoryId : CATEGORIES) { + List items = em.createQuery("select i from Item i where i.category.id = :catId", Item.class) + .setLockMode(LockModeType.OPTIMISTIC) + .setParameter("catId", categoryId) + .getResultList(); + + for (Item item : items) + totalPrice = totalPrice.add(item.getBuyNowPrice()); + } + + em.getTransaction().commit(); + em.close(); + ``` +- my understanding - why do i even need `LockModeType.OPTIMISTIC` if i already added `@Version` - e.g. understand in above example that we actually never modified Items for our query! our problem was that items that we read were modified! by default, jpa will only perform version checking using `@Version` for updates (maybe deletes as well, not sure). here, we want it to perform the version checking for the items we selected as well! so, we use `LockModeType.OPTIMISTIC` +- of course, for `LockModeType.OPTIMISTIC` to work, we need to have a `@Version` column, otherwise what will it check! +- note - i think we can annotate jpa repository methods with `@Lock(LockModeType.OPTIMISTIC)` as well +- disadvantage of lock mode - if we use 100 locks, we will get 100 additional queries for checking the version as described earlier +- i think that the point is that while transaction's isolation applies to the whole unit of work, the lock would apply to particular operations inside that transaction +- **optimistic force increment lock mode** - another problem - e.g. we want to find an item's highest bid. while performing the calculation, someone concurrently added a new bid. so, essentially our highest bid might be wrong. this cannot be caught by adding a version to bid as well +- a trick to solve this - enforce that when the item is read, its version is incremented. this way, when there is a flush, it would be noticed that the item version had changed (because a new bid was added to it) + ```java + Item item = em.find(Item.class, itemId, LockModeType.OPTIMISTIC_FORCE_INCREMENT); + bid.setItem(item); + bidRepo.save(bid); + // saving bid increments item version as well + // even though item did not change (bid has item_id, bid to item is many to one) + ``` +- this is a common operation - forceful increment of a root instance when child data is modified +- another advantage of optimistic force increment lock mode - recall how in optimistic lock mode, the version checking happens and then the transaction is committed. it can happen that during this phase itself, there is an update to the database! this is what optimistic force increment lock mode helps solve - i think because the root item's version needs to be incremented, it needs to be locked, just "reading" the version is not enough +- **pessimistic locks** - optimistic locks (we discussed two of them above) are implemented by jpa / hibernate using the version column, but pessimistic locks take help of the actual underlying database locks +- the difference between optimistic locks and pessimistic locks - + - **optimistic locks use version checks in for e.g. where clause of dml statements**, e.g. update only when version = 2 + - **pessimistic locks use database locks**. they can be shared (read locks are usually shared) or exclusive (e.g. write locks are usually exclusive). e.g. of doing this in mysql etc is `select ... for update`. 
**the idea is the rows which match the select clause cannot be touched till the lock is released / update is over**
+- **pessimistic force increment lock mode** - just like its optimistic counterpart. the only difference is that here, we increment the version at the beginning of the transaction, and not at the end. we now have a db lock on that record as well till the transaction gets over, so concurrent transactions cannot write to that row. whether they can read or not depends on whether the database uses 2 phase locking or mvcc. syntax - `LockModeType.PESSIMISTIC_FORCE_INCREMENT`
+- **pessimistic read** - acquire a read (recall how it is implemented as shared) lock
+- **pessimistic write** - acquire a write (recall how it is implemented as exclusive) lock
+- so five locks have been discussed - **optimistic**, **optimistic force increment**, **pessimistic force increment**, **pessimistic read**, **pessimistic write**
+- deadlocks - deadlocks can happen easily in concurrent applications, e.g. one thread tries updating item 1 and then item 2, while another thread tries updating item 2 and then item 1. thread 1 waits for lock 2, thread 2 waits for lock 1. the underlying dbms can detect this and abort one of the transactions
+- one solution - set the `hibernate.order_updates` property to true, so that updates are processed in the same order by all applications
+- spring data jpa has an "implicit transactional context" that kicks in for the repository methods we call if there is no existing transaction. however, when we use for e.g. `@DataJpaTest`, it has its own `@Transactional`. so, the behavior of a test (using the explicit transaction provided by the jpa test) might not be the same as the actual service layer code (using the implicit transaction of repositories). so, we should try using explicit transactions as a best practice
+- both spring and jakarta have the transactional annotations, i believe either can be used
+- we can also use `@Transactional` on repository methods
+- because of how spring proxies / aop work, `@Transactional` would not kick in when calling internal methods
+- tests - annotate classes with `@DataJpaTest`, it does have its own `@Transactional`. reason for writing jpa tests - e.g. we use jpa's query dsl. while it does have compile time checking, we should assert the functionality of our query
+- note - the `@DataJpaTest` annotation wasn't picking up the properties file, where i had configured the h2 url and parameters like MODE=MYSQL (otherwise flyway migration scripts were failing). so, i had to add the below based on [this](https://stackoverflow.com/a/57345507/11885333)
+  ```java
+  @DataJpaTest
+  @AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
+  ```
+- **if we annotate our test class with `@Transactional`, it rolls back the transaction at the end of each test method by default**. caveat - remember when using `RANDOM_PORT`, `DEFINED_PORT`, etc. a real servlet environment is used bts. thus, client and server run on different threads. therefore, only the client side transaction is rolled back
+- if a method in bean 1 calls a method in bean 2, in which transaction is the method in bean 2 executed? this is defined via **transaction propagation** -
+  - **required** - if a transaction exists, the process is continued in that transaction. else, a new transaction is created
+  - **supports** - if a transaction exists, the process is continued in that transaction.
else, no transaction is created + - **mandatory** - if a transaction exists, the process is continued in that transaction. else, `TransactionRequiredException` is thrown + - **requires new** - if a transaction exists, it is suspended and a new transaction is created. else, a new transaction is created + - **not supported** - if a transaction exists, it is suspended. else, no transaction is created + - **never** - if a transaction exists, `IllegalTransactionStateException` is thrown. else, no transaction is created + - **nested** - if a transaction exists, a sub transaction would be created. this means a **save point** is created and then the processes continues. if there is an error in the sub transaction, the changes would be rolled back up to the save point and then continued. if no transaction was present, a new transaction would be created +- optionally, we can specify `rollbackFor` to rollback the transaction for certain exceptions, or `noRollbackFor` to not rollback the transaction for certain exceptions +- inside `@Transactional` apart from propagation, isolation, (rollback for / no rollback for), etc. we can specify - + - **time out** - after this, the transaction will automatically rollback + - **read only** - marking transactions as read only allows jpa to make optimizations. so, remember parameters like this, `@Immutable`, etc +- using `@Transactional` is the **declarative**, preferred approach. we can use an imperative approach via `TransactionTemplate` + ```java + TransactionTemplate transactionTemplate = ...; + transactionTemplate.setIsolationLevel(...); + transactionTemplate.setPropagationBehavior(...); + transactionTemplate.execute((status) -> { + return ""; + }); + ``` +- we can load data by navigating the entity graph - `item.getSeller().getAddress().getCity()` - the focus of the next few points +- **fetch plan** - what to load +- **fetch strategy** - how to load +- **fetch profile** - store the fetch plan and fetch strategy as a fetch profile to reuse it later +- we define the default - **lazy** or **eager** in the domain models mapping +- we should try defaulting to lazy when possible, so that data is loaded on demand +- again, hibernate proxies are used to implement this functionality for us +- if for e.g. our entity is in detached state, we might get a `LazyInitializationException` when trying to access the lazily loaded fields +- my understanding - e.g. we want to find the size of a collection in one to many. if we run `item.getBids().size()`, i think the entire collection would be loaded due to the proxy nature. we can instead use `Hibernate.size(item.getBids())` to avoid this full query. this way, only the `count(*)` query would be run, and the `item.getBids()` still remains uninitialized. similarly, we have `Hibernate.contains` etc +- issues - + - lazy loading leads to **n + 1 selects problem** + - eager loading can lead to **cartesian product problem** +- we should avoid both extremes, and try finding a middle ground between both +- n + 1 selects problem - 1 query for fetching all items, then n queries for each item's seller + ```java + List items = em.createQuery("select i from Item i").getResultList(); + for (Item item : items) { + assertNotNull(item.getSeller.getUsername()); + } + ``` +- cartesian product problem - when we try eager loading of two collections with one sql query. e.g. an item has 3 images and 3 bids. it would result in an sql table with 9 rows. 
while it is automatically deduped for us if we use `Set`, this is not a desirable outcome, since a lot of duplicated rows are sent across the network from database to application. it is more performant to break the query into smaller individual parts +- apart from the above problem, we can have a lot of nested eager fetch statements, e.g. item has bids, which can have seller, which can have address and so on. hibernate has a `hibernate.max_fetch_depth` property. my understanding - after this depth is reached, hibernate will start issuing individual select statements like in lazy loading. by default, there is no preset limit for this property, while sql dialects like mysql set it to 2 by default +- **batch size** is one possible solution for n + 1 selects query problem. we annotate the User entity with `@BatchSize` like below - + ```java + @Entity + @BatchSize(size = 10) + public class User { + } + ``` +- refer the item example above, where each `item.getSeller().getUsername()` was resulting in a separate db call. with the current method, there would be a call like below - 10 user proxies would be initialized in one go - + ```sql + select * from users where id in (?, ?, ...) + ``` +- apparently, hibernate is more optimized then i thought it is! it will internally create several batch loaders, which i assume _hopefully_ run in parallel, i.e. if i specify batch size to be 32, and i have to load 31 items, there would be three fetches of sizes 16, 10 and 5, instead of one big fetch of 32. this behavior is configurable via `batch_fetch_style` +- the `BatchSize` argument can also be set on collections - + ```java + @BatchSize(size = 10) + private Set bids = new HashSet<>(); + ``` +- **fetch mode - subselect** is another solution for n + 1 selects query problem. we annotate with `@Fetch` like below - + ```java + @Fetch(FetchMode.SUBSELECT) + private Set bids = new HashSet<>(); + ``` +- refer the item example above, where each `item.getSeller().getUsername()` was resulting in a separate db call. with the current method, there would be a call like below - fetch all users for all items in one go - + ```sql + select * from bid where item_id in ( + select id from item where id in (?, ?, ...) + ) + ``` +- of course, such optimizations are restricted to a persistence context, because after that, probably hibernate discards the entities it stores in memory, and they are garbage collected +- **fetch mode - select** is a solution for the cartesian product problem. we annotate with `@Fetch` like below - + ```java + @Fetch(FetchMode.SELECT) + private Set bids = new HashSet<>(); + + @Fetch(FetchMode.SELECT) + private Set images = new HashSet<>(); + ``` +- with the current method, there would be separate calls for bids and images +- now cartesian product of course happens when setting fetch type as eager. since it is a global setting, it is not a reommended approach. 
the best approach is to dynamically fetch eagerly as and when needed +- dynamic eager fetching in jpql - `select i from Item i left join fetch i.bids` +- same support is present in criteria builder as well (not discussed) +- **fetch profiles** - global metadata, so while we can place it on a class, the best place for them is inside package-info.java + ```java + @FetchProfiles({ + @FetchProfile( + name = "fetch_bids", + fetchOverrides = @FetchProfile.FetchOverride( + entity = Item.class, + association = "bids", + mode = FetchMode.JOIN + ) + ), + @FetchProfile( + name = "fetch_images", + fetchOverrides = @FetchProfile.FetchOverride( + entity = Image.class, + association = "images", + mode = FetchMode.JOIN + ) + ) + }) + ``` +- since fetch profile is a hibernate specific feature, entity manager by itself is not enough for it. this technique of using unwrap to obtain a hibernate session from jpa entity manager is common - + ```java + em.unwrap(Session.class).enableFetchProfile("fetch_bids"); + Item item = em.find(Item.class, 123); + ``` +- jpa also has **entity graphs** for similar functionality +- **filtering data** - examples - + - when data is read from database by hibernate, restrict some data + - when data is written to database by hibernate, add some audit logs +- we can execute **side effects** using **event listeners**, which help hook into the lifecycle of hibernate +- `@PostPersist` - invoked after the entity is stored inside the database +- we can anotate any method with this, the class need not extend any special interface etc +- we can use the argument as `Object` to capture for all entities, or specify the type of the entity to capture it only for specific entities + ```java + public class PersistEntityListener { + + @PostPersist + public void logMessage(Object entityInstance) { + User currentUser = CurrentUser.INSTANCE.get(); + log.save("Entity instance persisted by " + + currentUser.getUsername() + + ": " + + entityInstance + ); + } + } + ``` +- we have many more annotations like `@PostPersist` for different points in the lifecycle +- for the entity listener above to work, the entity must be annotated with the right listeners - + ```java + @EntityListeners(PersistEntityListener.class) + @Entity + public class Item { + // ... + ``` +- we can also place it directly inside the entity itself, in which case the method will not have any arguments - we would use `this` instead + ```java + @Entity + public class User { + // ... + + + @PostPersist + public void logMessage() { + User currentUser = CurrentUser.INSTANCE.get(); + log.save("Entity instance persisted by " + + currentUser.getUsername() + + ": " + + this + ); + } + } + ``` +- this was all jpa i.e. annotations like `@PostPersist`, `@PreRemove`, etc. hibernate has an even more powerful api - **hibernate interceptors** (skipping for now since code is a bit more involved) +- envers - helps maintain multiple versions of the data +- we need to annotate entity we would like to audit using `@Audited`, and the properties we would like to skip for auditing using `@NotAudited` +- whenever we modify the data in some way, a new record is inserted in the **revinfo** table. this contains a **primary key (rev)** and a **timestamp**. use of timestamp - "give me a list of items as they were on last friday" +- now, each audited table will have a corresponding **foreign key (rev)** pointing to the revinfo table, and a **revtype** column which indicates whether the item was inserted, updated or deleted
+ ![envers](/assets/img/spring/envers.png) + +## Spring Security + +- security is a non functional requirement i.e. it isn't a part of business concerns, but it is critical +- includes https, firewalls, and application security (the focus of spring security) +- when we add the spring security dependencies, we get a session based authenticated app by default, where the default user name is user and the password is printed in console +- why spring security - + - supports a lot of different mechanisms like basic username / password authentication, oauth, jwt, etc + - supports lot of features like path or method level security with authorization etc +- recall flow - user <-> servlet container <-> filters <-> dispatcher servlet <-> controller handler +- **spring security** adds a lot of its own **filters** as well +- spring security architecture - + - user sends their details + - spring security filters will populate the "authentication object" with the user auth details - in spring security, this "authentication object" is the standard responsible to hold details related to current user + - then, this "authentication object" is forwarded to "authentication manager" + - the "authentication manager" talks to different "authentication providers". it tries all the "authentication providers" our application has configured, and selects the one that is successful + - the "authentication provider" takes the "authentication object" populated with credentials as input, and returns the "authentication object" populated with principal, authorities, etc as output + - we can have different "authentication provider"s - like ldap, oauth, username and password, etc + - "authentication providers" can take help of classes like - + - "user details service" / "user details manager" (which can retrieve users from the given principal) + - note how the communication between "user details service" and "authentication provider" is using "user details" object, and not "authentication object" like the rest of the flow + - "password encoder" + - finally, the authentication object is stored in the "security context" +- diagram -
+ ![spring security architecture](/assets/img/spring/spring-security-architecture.drawio.png) +- some concrete implementations of classes discussed above - no need to remember these, this just validates our understanding of the above diagram + - `UsernamePasswordAuthenticationToken` is an implementation of the `Authentication` object + - `ProviderManager` is an implementation of `AuthenticationManager` + - `DaoAuthenticationProvider` is an implementation of `AuthenticationProvider` + - `InMemoryUserDetailsManager` is an implementation of `UserDetailsManager` + - `User` is an implementation of `UserDetails` +- by default, the following `SecurityFilterChain` is configured for us, visible inside `SpringBootWebSecurityConfiguration` + ```java + @Bean + @Order(SecurityProperties.BASIC_AUTH_ORDER) + SecurityFilterChain defaultSecurityFilterChain(HttpSecurity http) throws Exception { + http.authorizeHttpRequests((requests) -> requests.anyRequest().authenticated()); + http.formLogin(withDefaults()); + http.httpBasic(withDefaults()); + return http.build(); + } + ``` +- this says - + - any request should be authenticated + - for ui as in when hitting endpoints from browser, show the basic form + - when hitting endpoints from postman etc, use basic authentication +- when we specify our own `SecurityFilterChain`, this bean would not be used +- for e.g. protecting all paths except some - + ```java + @Bean + public SecurityFilterChain securityFilterChain(HttpSecurity http) throws Exception { + http.authorizeHttpRequests((requests) -> requests + .requestMatchers("/notices", "/contact").permitAll() + .requestMatchers("/**").authenticated() + ); + http.formLogin(Customizer.withDefaults()); + http.httpBasic(Customizer.withDefaults()); + return http.build(); + } + ``` +- recall how authentication providers use `UserDetailsManager`. there are multiple implementations of `UserDetailsManager` like - + - `InMemoryUserDetailsManager` + - `JdbcUserDetailsManager` + - `LdapUserDetailsManager` +- all the `UserDetailsManager` implementations we discussed deal with the `UserDetails` object, which has functionality for getting authorities, username, password, etc +- recall we discussed that we use `Authentication` for communication between spring security classes. so, since the `UserDetailsManager` deals with `UserDetails`, the `AuthenticationProvider` converts the `UserDetails` object into `Authentication` object +- one of the `UserDetailsManager` implementations is `JdbcUserDetailsManager`. it expects tables to be present in a certain way e.g. tables for users, groups, authorities, etc. e.g. [refer the ddl here](https://docs.spring.io/spring-security/reference/servlet/authentication/passwords/jdbc.html) +- then, after ensuring the database has these tables, we can add a few records to the users and authorities tables +- then, we just add spring-data-jpa and correct driver for the database connection to the dependencies +- finally add the bean below - + ```java + @Bean + public UserDetailsManager userDetailsManager(DataSource dataSource) { + return new JdbcUserDetailsManager(dataSource); + } + ``` +- what if JdbcUserDetailsManager is not good for us due to the schema rigidity, and we want something custom, we can implement our own `UserDetailsService`. what is `UserDetailsService` 😫 - it is `UserDetailsManager` with only `loadByUsername`. 
our goal is to map the user representation in our system (customer in this case) that our data source understands to `UserDetails` object, which is implemented by `User` + ```java + @Bean + public UserDetailsService userDetailsService() { + return (username) -> customerDao.findByEmail(username) + .map(customer -> new User( + customer.getEmail(), // username + customer.getPassword(), // password + List.of(new SimpleGrantedAuthority(customer.getRole())) // authorities + )) + .orElseThrow(() -> new UsernameNotFoundException("customer with email " + username + " not found")); + } + + @Bean + public PasswordEncoder passwordEncoder() { + return new BCryptPasswordEncoder(); + } + ``` +- notice how with so less lines of code, we have a custom authentication + authorization built! - all we did was + - specify the `UserDetailsManager` slice to use via `UserDetailsService` + - the password encoder to use + - authenticate endpoints using a bean of `SecurityFilterChain` +- why did we not have to do any password validation? because `AuthenticationProvider` (concrete implementation is `DaoAuthenticationProvider`) does it for us automatically based on the password encoder we configure! remember, we configured user details manager, not authentication provider +- password encoder - + - encoding - e.g. base64. an algorithm is used to encode. this doesn't involve any secret. we can usually use decoding to retrieve the actual value. so, it is not ideal for passwords + - encryption - a secret key is used, so it is more secure than encoding. however, we can still use decryption to get back the original value, if the secret is leaked + - hashing (1 way) - e.g. bcrypt. use a function to obtain a hash value. it is not reversible, so it is very secure. to validate, we pass the input and **match it** with the stored hashed value. now what does match it actually mean - + - every time the hash is generated for the **same input**, the output is different! this way, if two users have the same password, the same representation is **not** stored inside the database, thus making it even more secure. the hashing algorithm knows if the raw input **matches** the stored hash value +- since i used the bcrypt password encoder, the stored value looks like this - `$2a$10$aj6zt3F9zLr9U39kwVUCxusnd.DvqakuP9/lxp8n8yFHnKrOvIuIK`. here, the beginning i.e. $2a gives the version of bcrypt used, and after that, $10 gives the number of rounds used +- for brcypt (or generally any hashing algorithm?) 
we can configure - + - **strength** + - **number of rounds** + - **salt** +- a simple registration process based on the `UserDetailsService` and `AuthenticationProvider` we configured above - + ```java + @PostMapping("/register") + @ResponseStatus(HttpStatus.CREATED) + public void registerUser(@RequestBody PersistentCustomer customer) { + customerDao.findByEmail(customer.getEmail()).ifPresent((existing) -> { + throw new RuntimeException("customer with email " + existing.getEmail() + " already exists"); + }); + customer.setPassword(passwordEncoder.encode(customer.getPassword())); + customerDao.save(customer); + } + ``` +- if we wanted more customization, **instead of** providing implementation of `UserDetailsManager` via `UserDetailsService#loadByUsername`, we can provide a bean of `AuthenticationProvider` +- understand how based on flow diagram we saw, unlike returning `UserDetails` object via concrete implementation `User`, we now have to return `Authentication` object via concrete implementation `UsernamePasswordAuthenticationToken` + ```java + @Component + @RequiredArgsConstructor + public class CustomAuthenticationProvider implements AuthenticationProvider { + + private final CustomerDao customerDao; + + private final PasswordEncoder passwordEncoder; + + @Override + public Authentication authenticate(Authentication authentication) throws AuthenticationException { + PersistentCustomer customer = customerDao.findByEmail(authentication.getName()) + .orElseThrow(() -> new BadCredentialsException("customer with email " + authentication.getName() + " does not exist")); + if (!passwordEncoder.matches(authentication.getCredentials().toString(), customer.getPassword())) { + throw new BadCredentialsException("passwords do not match for customer with email " + authentication.getName()); + } + return new UsernamePasswordAuthenticationToken( + customer.getEmail(), + customer.getPassword(), + List.of(new SimpleGrantedAuthority(customer.getRole())) + ); + } + + @Override + public boolean supports(Class authentication) { + return (UsernamePasswordAuthenticationToken.class.isAssignableFrom(authentication)); + } + } + ``` +- cors - cross origin resource sharing +- origin = protocol (http) + domain + port +- communication is stopped across origins **by browsers** to prevent security issues +- so, for e.g. a different website cannot use our api unless our apis allow this website's domain explicitly +- browsers make a **preflight request** - the request is made by the browser, to which the backend responds with what methods and endpoints are allowed +- we can either configure cors using `@CrossOrigin(domain)` on a per controller basis (usually not ideal), or use the below - + ```java + // configure the SecurityFilterChain bean like so + http.cors(Customizer.withDefaults()); + + @Bean + public CorsConfigurationSource corsConfigurationSource() { + CorsConfiguration configuration = new CorsConfiguration(); + configuration.setAllowedOrigins(List.of("http://localhost:4200/")); + configuration.setAllowedMethods(List.of("*")); + configuration.setAllowedHeaders(List.of("*")); + configuration.setAllowCredentials(true); + UrlBasedCorsConfigurationSource source = new UrlBasedCorsConfigurationSource(); + source.registerCorsConfiguration("/**", configuration); + return source; + } + ``` +- something i didn't know - for e.g. recall the action method on forms? from my understanding, this is not protected by cors, i.e. 
if a website evil.com has its action set to netflix.com, even if netflix configures cors correctly, this form action would go through! this concept is important in csrf discussed below +- also my understanding of where csrf might be important - cors depends on browser the client uses, what if the client uses a browser that does not have cors functionality? +- csrf - security vulnerability (unlike cors, which is a guard rail provided by browsers) +- csrf - cross site request forgery +- example - + - we log into netflix.com, and netflix stores a cookie in our browser - recall how cookies are scoped to a domain + - assume we click on a malicious link, which actually makes a put api call to netflix.com, to for e.g. change the password of the current user + - since netflix had already stored a cookie in our browser, the request goes through, and netflix thinks it is a request from a legitimate user, and the password of our account is changed easily! +- solution - a **secure random csrf token** is generated, which is **unique per session** +- so, assume with csrf implemented correctly, our ui receives a csrf token inside a cookie / response header, etc along with a separate cookie for authentication +- for further requests, we forward this csrf token inside the request header / request body along with the authentication cookie. do not send csrf token as a cookie, since then we are back to the same problem as authentication cookie! we can receive the csrf token as a cookie, but then we need to parse it and send it as a request body / header. this parsing cannot be done by evil.com, since it is a different domain, so it does not have access to cookies +- disabling csrf - `http.csrf(csrf -> csrf.disable());` / `http.csrf(AbstractHttpConfigurer::disable);` +- configuring csrf correctly - we can use `CookieCsrfTokenRepository`, which writes the csrf token to a cookie named `XSRF-TOKEN` and reads it from an http request header named `X-XSRF-TOKEN` or the request parameter `_csrf` +- [this documentation](https://docs.spring.io/spring-security/reference/servlet/exploits/csrf.html) seems to have a good explanation for csrf, skipping for now +- my doubt - if we for e.g. send jwt not as a cookie but as a header, wouldn't we automatically be protected by csrf? 
because the malicious website cannot "parse" or "access" the jwt, just like it cannot access or parse the csrf cookie +- authentication error - 401, authorization error - 403 +- authentication happens before authorization +- authorities are stored via interface `GrantedAuthority` and concrete implementation `SimpleGrantedAuthority` +- these authorities are available on both `UserDetails` (used between `UserDetailsManager` and `AuthenticationProvider`) and `Authentication` object (used between `AuthenticationProvider` and `AuthenticationManager`) +- code example - + ```java + http.authorizeHttpRequests((requests) -> requests + .requestMatchers("/myAccount").hasAuthority("view_account") + .requestMatchers("/myBalance").hasAnyAuthority("view_account", "view_balance") + .requestMatchers("/user").authenticated() + .requestMatchers("/contact").permitAll() + ); + ``` +- like authority, we have hasRole and hasAnyRole as well +- my understanding - spring requires that roles have the `ROLE_` prefix + - so when using hasRole etc, do not specify the `ROLE_` prefix + ```java + .requestMatchers("/myBalance").hasAnyRole("user", "admin") + .requestMatchers("/myLoans").hasRole("user") + ``` + - either save to the database with the `ROLE_` prefix, or when mapping to `GrantedAuthority` inside `UserDetailsService`, add the `ROLE_` prefix (internally, our schema stores one to many for `PersistentCustomer` and `PersistentAuthority`) + ```java + @Entity + @Data + @AllArgsConstructor + @NoArgsConstructor + @Table(name = "authorities") + public class PersistentAuthority { + + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Integer id; + + private String name; + + @ManyToOne + @JoinColumn(name = "customer_id") + private PersistentCustomer customer; + + public GrantedAuthority map() { + return new SimpleGrantedAuthority("ROLE_" + name); + } + } + ``` +- authority - individual actions like "view account", "view balance", etc +- role - group of authorities +- one practice used at my firm - + - think of privilege as action + resource combination - "view balance", "view card", etc - these map to authorities + - different roles have different authorities - admins and ops can have "edit card", all users will have "view account" etc + - allow assigning multiple roles to users +- **filters** - we can write our own filters and inject them into the spring security flow +- **filter chain** - represents a collection of filters which have to be executed in a defined order +- so, on `HttpSecurity http`, we can call `http.addFilterBefore`, `http.addFilterAfter` and `http.addFilterAt` + ```java + @Slf4j + public class UserLoggingFilter implements Filter { + + @Override + public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse, FilterChain filterChain) throws IOException, ServletException { + + // typically this typecasting might be needed, not used here though + HttpServletRequest request = (HttpServletRequest) servletRequest; + HttpServletResponse response = (HttpServletResponse) servletResponse; + + Authentication authentication = SecurityContextHolder.getContext().getAuthentication(); + if (authentication != null) { + log.info("user {} with authorities {} has logged in", authentication.getName(), authentication.getAuthorities()); + } + + filterChain.doFilter(servletRequest, servletResponse); + } + } + + http.addFilterAfter(new UserLoggingFilter(), BasicAuthenticationFilter.class); + ``` +- we implemented `Filter` above. 
we can **instead** use - + - `GenericFilterBean` - has access to a lot of other things like context, environment, etc + - `OncePerRequestFilter` - to ensure that the filter is executed only once, even if it is invoked multiple times by the underlying logic +- tokens - when the clients login successfully, they are returned a token from the backend. the clients should then attach this token to every request to access protected resources +- advantage of using tokens + - we do not share our credentials for every request every time like in for e.g. basic auth, we just pass around the token every time + - if tokens are compromised we can easily regenerate them. credentials cannot be changed easily for every user + - tokens can have a expiry attached to them, post which they have to be regenerated + - tokens allow storing of other user related information like name, email, roles, etc. this way, the backend can simply use these without every time for e.g. "fetching" this information + - we can reuse tokens for different kinds of applications like maps, email, etc + - statelessness - for horizontally scaled applications since it doesn't need sessions +- jwt tokens - they have the format `<
header>.<payload>.<signature>`
+- header - metadata like the algorithm used for generating the token, e.g. hs256 (hmac with sha-256). it is in base64 encoded format
+- payload - name, email, roles, who issued the token, expiry, etc. it is also in base64 encoded format
+- e.g. someone can easily decode the payload using base64, add a role to it and encode it back again using base64. solution - signature
+- signature - a digital signature for tokens. it helps ensure that the token has not been tampered with
+- the algorithm in the header is used to generate this signature - `hmacsha256(base64(header) + '.' + base64(payload), secret)`. the secret here is only known to the backend
+- on receiving the token, the backend can recompute the signature using the provided header and payload. if the signatures do not match, the backend can conclude that the token is invalid
+- try to compare how jwt matches all the advantages we had mentioned for using tokens
+- add [these](https://github.com/jwtk/jjwt#maven) maven dependencies -
+  ```xml
+  <dependency>
+    <groupId>io.jsonwebtoken</groupId>
+    <artifactId>jjwt-api</artifactId>
+    <version>${jjwt.version}</version>
+  </dependency>
+
+  <dependency>
+    <groupId>io.jsonwebtoken</groupId>
+    <artifactId>jjwt-impl</artifactId>
+    <version>${jjwt.version}</version>
+    <scope>runtime</scope>
+  </dependency>
+
+  <dependency>
+    <groupId>io.jsonwebtoken</groupId>
+    <artifactId>jjwt-jackson</artifactId>
+    <version>${jjwt.version}</version>
+    <scope>runtime</scope>
+  </dependency>
+  ```
+- disable spring security's session creation
+  ```java
+  http.sessionManagement(session -> session.sessionCreationPolicy(SessionCreationPolicy.STATELESS));
+  ```
+- we generate the jwt using a `OncePerRequestFilter`. notes -
+  - we should do this when we can be sure that the authentication is successful, so we use `addFilterAfter`
+  - using `shouldNotFilter`, we ensure that this token is generated only when the user logs in, which happens using the /user path
+  - remember to call `filterChain.doFilter` at the end, otherwise the request stops at this filter and no response is ever written
+
+  ```java
+  // secret can come from application.properties
+  http.addFilterAfter(new JWTTokenGeneratorFilter(secret), BasicAuthenticationFilter.class);
+
+  @RequiredArgsConstructor
+  public class JWTTokenGeneratorFilter extends OncePerRequestFilter {
+
+    private final String secret;
+
+    @Override
+    protected void doFilterInternal(HttpServletRequest request, HttpServletResponse response, FilterChain filterChain) throws ServletException, IOException {
+
+      Authentication authentication = SecurityContextHolder.getContext().getAuthentication();
+
+      if (authentication != null) {
+
+        SecretKey key = Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8));
+
+        String serializedAuthorities = authentication
+          .getAuthorities()
+          .stream()
+          .map(GrantedAuthority::getAuthority)
+          .collect(Collectors.joining(","));
+
+        String jwt = Jwts.builder()
+          .claim("username", authentication.getName())
+          .claim("authorities", serializedAuthorities)
+          .issuedAt(new Date())
+          .expiration(new Date(new Date().getTime() + (24 * 60 * 60 * 1000)))
+          .signWith(key)
+          .compact();
+
+        response.setHeader(HttpHeaders.AUTHORIZATION, jwt);
+      }
+
+      // continue with the remaining filters in the chain
+      filterChain.doFilter(request, response);
+    }
+
+    @Override
+    protected boolean shouldNotFilter(HttpServletRequest request) throws ServletException {
+      return !request.getServletPath().equals("/user");
+    }
+  }
+  ```
+- verifying the token - this time, we use `addFilterBefore` and also invert the condition inside `shouldNotFilter`
+  ```java
+  http.addFilterBefore(new JWTTokenValidatorFilter(secret), BasicAuthenticationFilter.class);
+
+  @RequiredArgsConstructor
+  public class JWTTokenValidatorFilter extends OncePerRequestFilter {
+
+    private final String secret;
+
+    @Override
+    protected void doFilterInternal(HttpServletRequest request, HttpServletResponse response, FilterChain filterChain) throws ServletException, IOException {
+
+      String jwt = request.getHeader(HttpHeaders.AUTHORIZATION);
+
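+      // the header may be absent for requests that do not carry a token yet, hence the null check below;
+      // if the signature does not verify, parseSignedClaims throws, which we surface as a BadCredentialsException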
+ if (jwt != null) { + try { + SecretKey key = Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8)); + + Claims payload = Jwts.parser() + .verifyWith(key) + .build() + .parseSignedClaims(jwt) + .getPayload(); + + Authentication authentication = new UsernamePasswordAuthenticationToken( + payload.get("username"), + null, + AuthorityUtils.commaSeparatedStringToAuthorityList(payload.get("authorities", String.class)) + ); + + SecurityContextHolder.getContext().setAuthentication(authentication); + } catch (Exception e) { + throw new BadCredentialsException("invalid token received"); + } + } + } + + @Override + protected boolean shouldNotFilter(HttpServletRequest request) throws ServletException { + return request.getServletPath().equals("/user"); + } + } + ``` +- method level security - add `@EnableMethodSecurity` on any `@Configuration` / `@SpringBootApplication` class + ```java + @Configuration + @RequiredArgsConstructor + @EnableMethodSecurity + public class SecurityConfig { + ``` +- in the pre and post annotations, we can also use spel (spring expression language) +- `@PreAuthorize` - decide if a user is authorized to call a method before actually invoking the method + ```java + @PreAuthorize("hasAnyRole('user', 'admin')") + @PreAuthorize("hasAuthority('view_details')") + @PreAuthorize("#username == authentication.principal.username") + public void preAuthorizeExample(String username) { + } + ``` +- for complex requirements - we can call custom methods, methods on beans, etc afaik from inside these annotations. then we can for e.g. pass the authentication object from inside the annotation to these methods as well +- `@PostAuthorize` - would not stop the method from being executed, but would run after the invocation +- spring aop is used for implementing these annotations bts +- `@PreFilter` and `@PostFilter` - works on objects of type collections. helps filter inputs / outputs. i don't see its use case as of now diff --git a/_posts/2023-07-23-relational-databases.md b/_posts/2023-07-23-relational-databases.md new file mode 100644 index 0000000..447ab40 --- /dev/null +++ b/_posts/2023-07-23-relational-databases.md @@ -0,0 +1,565 @@ +--- +title: Relational Databases +--- + +## Downsides of File Based Systems + +- data redundancy - data repeated at different places +- data inconsistency - data update at one place might not be reflected at another place +- difficult data access - searching through records can be difficult +- security problems - granular control to allow access to databases +- difficult concurrent access - erroneous updates if people try editing files simultaneously, file locks allow only one person to edit files at a time +- integrity constraints - we can't enforce constraints like ensuring a specific data type for an attribute +- databases backup and recovery features are less efficient + +## Entity Relationship Data Model + +- er model is a high-level conceptual data model +- they are used in documentations via er diagrams +- entity - an object like a particular employee or project e.g. an employee jack +- entity type - type of the entity e.g. Employee +- entity set - group of all entities (not entity types) +- attribute - an entity has attributes like age, name +- an entity type is represented as a rectangle +- an attribute is represented as an oval. it can be of following types - + - simple attribute + - composite attribute - composed of multiple attributes e.g. name from first name and last name. 
it is represented as a tree of ovals + - multivalued attribute - can take an array of values e.g. phone number. the oval has a double outline + - derived attribute - calculated from other attributes e.g. age from birthdate. the oval has a dotted outline +- key attribute - has a value which is distinct for each entity, also called primary key e.g. ssn (social security number) of an employee. represented by an underline on the attribute +- composite key - multiple keys combine to uniquely identify an entity. e.g. vin (vehicle identification number) using state and a number. represent as a composite attribute and underline the key attribute as well +- natural key - use an attribute to uniquely identify an entity. e.g. isbn of book +- relationship - an association between two entities e.g. jack works on project xyz +- relationship type - type of relation e.g. works_on +- relationship set - group of all relationships (not relationship types), just like entity set +- a relationship type is represented as a diamond +- degree - defined on a relationship type, it represents the number of participating entities. it can be of the following types - + - **unary** (recursive) - an entity type is linked to itself, e.g. an employee supervises another employee + - **binary** - two entity types are linked, e.g. employee works on a project + - **ternary** - three entity types are linked, e.g. supplier supplies parts to project +- binary relationship constraints - + - cardinality - represent by writing 1 / N on the arrow + - **one to one** - an entity in set a can be associated to at most one entity in set b and vice versa as well e.g. an employee manages a department + - **one to many** - an entity in set a can be associated to many entities in set b but an entity in set b can be associated to at most one entity in set a e.g. employees are a part of a department + - **many to many** - an entity in set a can be associated to many entities in set b and vice versa e.g. employees work on a project + - participation - + - **total participation** - each entity must participate at least once in the relation, e.g. in employees working on a project, a project has total participation, represented as a double line + - **partial participation** - an entity need not participate in the relation, e.g. in employees working on a project, an employee has partial participation (e.g. hr), represented as a single line +- attributes on relation types - unless cardinality is many to many, since a table is created for many to many, we should try and move attributes of relationships to one of the tables +- weak entity - they cannot exist independently e.g. a course cannot exist without a program. they don't have key attributes (look above) of their own. they are identified via their owner or identifying entity type, and the relation between the weak and identifying entity is called identifying relationship. the attribute which helps in differentiating between the different weak entities of an identifying entity is called a **partial key**. e.g. dependents of an employee. weak entity is represented as a double line for the rectangle and identifying relationship is represented as a double line for the diamond. partial key is represented as a dotted underline. 
weak entity should of course, have a total participation +- strong entity - have their own key attributes + +## ER Diagram Example + +- entities - + - students have a name, a student identifier, one or more contact numbers + - programs have a name, a program identifier + - courses have a name, a course identifier +- relationships - + - student takes up one or more courses + - student must enroll in a program + - program contains courses + +![er diagram example](/assets/img/relational-databases/er-diagram-example.drawio.png) + +## Relational Model + +- relation - collection of related data, represented as a table +- tuple - also called records, represented as a row, an instance of the type of object stored in the table +- attribute - represented as a column, describe the record +- relation schema - relation name with its attributes' names e.g. employee(id, name, phone number) +- database schema - combination of all relation schemas +- database instance - information stored in a database at a particular time +- domain - set of acceptable values an attribute can contain +- in a relation, sequence of rows and columns are insignificant +- keys - we need keys to fetch tuples easily and to establish a connection across relations +- different types of keys are - + - **super key** - set of attributes that can uniquely identify any row. super key is like a power set. e.g. in employee, (id), (phone), (id, name), (name, phone), (id, phone), (id, name, phone) are all super keys + - **candidate key** - minimal set of attributes that can uniquely identify any row e.g. id, phone number. (id, name) is not a candidate key as id itself can uniquely identify any row + - **primary key** - one out of all the candidate keys is chosen as the primary key e.g. id of employee + - **composite key** - candidate keys that have two or more attributes e.g. vehicle(state, number) + - **alternate key** - any candidate key not selected as the primary key + - **foreign key** - the primary key of a relation when used in another relation is called a foreign key. it helps in connecting the two relations, the referencing and referenced relation +- **integrity constraints** - to maintain the integrity of database i.e. maintain quality of information as crud keeps happening, following rules are present - + - **domain constraint** - each value of an attribute must be within the domain + - **entity constraint** - all relations must have primary key, it cannot be null + - **referential constraint** - foreign key must either reference a valid tuple or be null + - **key constraint** - primary key must be unique +- common relational database operations - crud i.e. create, read, update, delete + +## Functional Dependency + +- X ➔ Y means given X, we can determine Y e.g. 
in student(id, name), id ➔ name but reverse is not true +- X is called **determinant** while Y is called **dependent** +- **armstrong's axioms** are a set of inference rules to determine all functional dependencies + - axiom of reflexivity - if Y ⊆ X, then X ➔ Y + - axiom of augmentation - if X ➔ Y, then XZ ➔ YZ + - axiom of transitivity - if X ➔ Y and Y ➔ Z, then if X ➔ Z +- prime attribute - a part of any candidate key +- partial dependency - when a non-prime attribute is dependent on a prime attribute +- transitive dependency - when a non-prime attribute is dependent on another non-prime attribute + +## Normalization + +- normalization helps in determining the level of redundancy in a database and providing fixes for them +- there are six normal forms, but only 1nf, 2nf, 3nf and bcnf have been discussed +- sometimes, we do not normalize our database entirely. it not only improves performance for analytics, but if data is duplicated, it works like a double check, thus **reducing chances of corrupt data** + +### First Normal Form + +for being in first normal form or 1nf, relation shouldn't have a multivalued attribute. e.g. + +| id | name | phone | +|-----|------|------------------------| +| 1 | jack | 8745784547, 6587784512 | +| 2 | jane | 3412478452 | + +should be converted to + +| id | name | phone | +|-----|------|------------| +| 1 | jack | 8745784547 | +| 1 | jack | 6587784512 | +| 2 | jane | 3412478452 | + +### Second Normal Form + +for being in second normal form or 2nf, relation should be in 1nf and shouldn't have partial dependencies. e.g. + +| student_id | course_id | course_fee | +|------------|-----------|------------| +| 1 | 1 | 120 | +| 2 | 2 | 150 | +| 1 | 2 | 150 | + +this has partial dependency course_id ➔ course_fee since primary key is (student_id, course_id). +so, it should be split into two tables + +| student_id | course_id | +|------------|-----------| +| 1 | 1 | +| 2 | 2 | +| 1 | 2 | + +| course_id | course_fee | +|-----------|------------| +| 1 | 120 | +| 2 | 150 | + +note how this also reduced data redundancy by storing the course_fee values only once + +### Third Normal Form + +for being in third normal form or 3nf, relation should be in 2nf and shouldn't have transitive dependencies. e.g. + +| student_id | country | capital | +|------------|---------|-----------| +| 1 | india | delhi | +| 2 | nepal | kathmandu | +| 3 | nepal | kathmandu | + +this has transitive dependency country ➔ capital since the capital can be derived from country, and the primary key is student_id. so, it should be split into + +| student_id | country | +|------------|---------| +| 1 | india | +| 2 | nepal | +| 3 | nepal | + +| country | capital | +|---------|-----------| +| india | delhi | +| nepal | kathmandu | + +### Boyce Codd Normal Form + +- for being in boyce-codd normal form or bcnf, relation should be in 3nf and a dependency A ➔ B is allowed only if A is a super key, doesn't matter what B is which make sense, as super keys should be able to find everything. so to check for bcnf, only check if lhs of dependency is super key or not +- e.g. - AB ➔ C and C ➔ B. candidate keys are AB and AC. neither of the dependencies are partial or transitive, so it is in 3nf already. however, C is not a super key, yet we have C ➔ B. 
so, it is not in bcnf +- my understanding - for bcnf, split into two tables - AC (AC is candidate key) and BC (C is candidate key) +- basically, since prime ➔ non-prime was covered in 2nf, non-prime ➔ non-prime was covered in 3nf, we wanted to remove (prime / non-prime) ➔ prime in bcnf + +## About SQL + +- sql is a standard that has been adopted by various vendors for their implementations. the implementations include db2 by ibm, oracle rdbms by oracle, sql server by microsoft, postgresql and mysql which are opensource, etc. this blog is about mysql implementations of concepts, so things can be different for other distributions +- application / client layer - helps in client connections, authentication and authorization +- server layer - it parses, analyzes and optimizes queries. it also maintains cache and buffers. it makes an execution plan which gets fed into the storage engine layer +- storage engine layer - this layer actually writes and retrieves data from the underlying physical storage. mysql supports different storage engine layers like InnoDB, MyISAM, etc. which we can view by `show engines`. InnoDB is the default. e.g. the way transactions are carried out in them can be different + +## Database Commands + +- `show databases` - list all the database. it would only show the databases that we are authorized to view +- `use database_name` - selecting the database with name database_name. future queries would be performed on the selected database +- `show create database mysql` - shows the command using which the database was created +- `show tables` - display the tables in the current database +- `create database if not exists movie_industry` - create the database if it doesn't exist +- `drop database if exists movie_industry` - drop the database if it exists + +## Table Commands + +- we have a lot of data types in mysql, look [here](https://dev.mysql.com/doc/refman/8.0/en/data-types.html), categorized into numeric data types, date and time data types, string data types, spatial data types, json data type. e.g. numeric data type can have int, bigint, tinyint, decimal +- `describe user` - describe the structure of a table +- `show create table user` - shows the command using which the table was created +- we can provide a constraint for non-nullable fields using `not null` +- we can provide a default value using `default` +- we can automatically assign the next integer using `auto_increment`. auto increment has a few restrictions - + - there can be only one column in a table marked as auto increment + - the auto increment column should be indexed + - the auto increment column cannot have a default value +- create table example - + ```sql + create table if not exists actors ( + id int auto_increment, + first_name varchar(20) not null, + second_name varchar(20) not null, + dob date not null, + gender enum("male", "female", "other") not null, + marital_status enum("married", "divorced", "single") not null default "unknown", + net_worth_in_millions decimal not null, + primary key (id) + ); + ``` +- we can use `default` while inserting data to instruct mysql to use the default value. it would work for auto increment id as well. 
we can also not specify the column name altogether +- insert into table by not specifying id - + ```sql + insert into actors (first_name, second_name) values ("jennifer", "aniston"); + ``` +- insert into table by specifying id which is auto increment - + ```sql + insert into + actors (first_name, second_name, id) + values + ("jennifer", "aniston", default), + ("johnny", "depp", default); + ``` +- querying in tables by selecting all columns - + ```sql + select * from actors; + ``` +- select specific columns and filter results using `where` clause - + ```sql + select first_name, second_name from actors where first_name = "tom"; + ``` +- we have a lot of operators in mysql, look [here](https://dev.mysql.com/doc/refman/8.0/en/non-typed-operators.html) +- we can use the `like` operator with where clause for pattern matching. `_` can be used to match exactly one character, `%` can be used to match 0 or more characters - + ```sql + select * from actors where first_name like '_enn%'; -- matches jennifer + ``` +- we can use `cast` to change data type +- e.g. order query results by number, but number would be treated as strings i.e. 2 > 10 + ```sql + select * from actors order by cast(age as char); + ``` +- we can `limit` the number of results returned, and `offset` it from a certain point. note: sql will automatically handle even if our limit or offset goes beyond the number of rows by giving back sensible results + ```sql + select first_name from actors order by age desc limit 4 offset 3; + ``` +- delete selective rows - + ```sql + delete from actors where gender = "male" order by age desc limit 3; + ``` +- for deleting all rows, a faster method is `truncate actors`, it would delete the table entirely and recreate it +- update selective rows - + ```sql + update actors set age = 25 order by first_name limit 3; + ``` +- we can alter name and data type of column, provide a default value. note: while altering data type, the new and old data types should be compatible - + ```sql + alter table actors change first_name firstName varchar(20) default "anonymous"; + ``` +- adding a column - + ```sql + alter table actors add first_name varchar(20); + ``` +- deleting a column - + ```sql + alter table actors drop first_name; + ``` +- indices help in querying data efficiently, just like we search for words in a dictionary. downside is the overhead of creating, storing and maintaining these indices. internally, mysql uses b / b+ trees with the keys of the nodes as primary indices. this helps in efficient querying of data +- we can create an index on name to speed up queries - + ```sql + alter table actors add index index_name (first_name); + ``` +- we can also drop that created index - + ```sql + alter table actors drop index index_name; + ``` +- alter table name - + ```sql + alter table actors rename Actors; + ``` +- delete table - + ```sql + drop table if exists actors; + ``` +- aliases can be used to give temporary names, as they help us write queries that are more readable + ```sql + select + t1.first_name as a, t2.first_name as b + from + actors as t1, actors as t2 + where + t1.net_worth_in_millions = t2.net_worth_in_millions and t1.id > t2.id; + ``` +- distinct is a post-processing filter i.e. works on the resulting rows of a query & can be used on multiple columns + ```sql + select distinct first_name, last_name from actors; + ``` +- aggregate methods like `min`, `max`, `sum`, `count` can be used - + ```sql + select count(*) from actors; + ``` +- group by - helps group rows based on a particular column. 
we cannot use columns **not** present in group by for select, having, or order by clauses + ```sql + select gender, avg(net_worth_in_millions) from actors group by gender; + ``` +- while the where clause helps us filter rows, the having clause helps us filter groups + ```sql + select + marital_status, avg(net_worth_in_millions) as avg_net_worth_in_millions + from + actors + group by + marital_status having avg_net_worth_in_millions > 200 + ``` +- adding a foreign key constraint - + ```sql + alter table digital_assets + add constraint digital_assets_actor + foreign key (actor_id) references actors(id); + ``` + +## Joins + +- **cross join** - cartesian product of the rows of the two tables +- **inner join** - all rows of both the tables where the condition (called the join predicate) is satisfied +- **left outer join** - result of inner join + all rows of the left table, with null for the columns of the right table +- **right outer join** - result of inner join + all rows of the right table, with null for the columns of left table +- **full outer join** - result of inner join + all rows of the left table, with null for the columns of the right table + all rows of the right table, with null for the columns of the left table +- **self join** - using the same table on both sides of the join +- inner join example - assume digital_assets table contains social media links, where the asset_type is an enum containing twitter etc. and url is the link + ```sql + select + actors.first_name, actors.second_name, digital_assets.asset_type, digital_assets.url + from + actors inner join digital_assets + on + actors.id = digital_assets.actor_id; + ``` + if the same column name is not there in the two tables, the "table." prefix can be removed e.g. `first_name` in place of `actors.first_name`, though i prefer being explicit +- the above query can be rewritten as below, with **no** performance impact + ```sql + select + actors.first_name, actors.second_name, digital_assets.asset_type, digital_assets.url + from + actors, digital_assets + where + actors.id = digital_assets.actor_id; + ``` +- union clause - merely clubs results together, doesn't join the tables. e.g. the following query will display a list of all actress names, followed by all male actor names + ```sql + select concat(first_name, ' ', last_name) from actors where gender = 'female' + union + select concat(first_name, ' ', last_name) from actors where gender = 'male' + ``` + note: duplicates are automatically removed since it is a "union", which can be prevented using `union all` +- left outer join syntax (right join would have similar syntax, not discussed). e.g. in the below query, actors without social media handles would be displayed too, with the columns for `asset_type` and `url` holding null - + ```sql + select + actors.first_name, actors.second_name, digital_assets.asset_type, digital_assets.url + from + actors left outer join digital_assets + on + actors.id = digital_assets.actor_id; + ``` +- natural join - syntactic sugar, no need to explicitly specify the columns to use for join, i won't use it + +## Nested Queries + +- nested queries are slower but sometimes the only way to write a query +- the following is an example of **nested scalar query**, since the nested query returns a single value. e.g. 
find all actors who had updated their digital assets most recently + ```sql + select + first_name + from + actors inner join digital_assets on digital_assets.actor_id = actors.id + where + digital_assets.last_updated = ( + select max(digital_assets.last_updated) from digital_assets + ); + ``` +- e.g. find all actors who are on facebook + ```sql + select * from actors where id in ( + select actor_id from digital_assets where asset_type = 'facebook' + ) + ``` +- e.g. find actors who updated their social handles on their birthday + ```sql + select + actors.first_name + from + actors inner join digital_assets + on + actors.id = digital_assets.actor_id and + actors.dob = digital_assets.last_updated + ``` +- the following is an example of a nested query where it returns a collection of columns. the query returns the same results as the example as above + ```sql + select first_name from actors where (id, dob) in + (select actor_id, last_updated from digital_assets); + ``` + +## Correlated Queries + +- the subquery references columns from the main query +- note: we can use the `exists` operator to check if the subquery returns any rows +- e.g. find actors with their names in their twitter handles - + ```sql + select + actors.first_name + from + actors inner join digital_assets + on + actors.id = digital_assets.actor_id + where + digital_assets.url like concat('%', actors.first_name, '%') and + digital_assets.asset_type = 'twitter' + ``` +- the query returns the same results as the example as above + ```sql + select first_name from actors where exists ( + select + * + from + digital_assets + where + digital_assets.actor_id = actors.id and + digital_assets.url like concat('%', actors.first_name, '%') and + digital_assets.asset_type = 'twitter' + ) + ``` +- difference between nested queries and correlated queries - in nested queries, the subquery runs first and then the main query runs. in correlated queries, the subquery runs for every row of the main query, and the subquery runs after the main query + +## Multi Table Operations + +- multi table delete use case - delete related data from multiple tables + ```sql + delete + actors, digital_assets -- tables to delete rows from + from + actors, digital_assets + where + actors.id = digital_assets.actor_id and + digital_assets.asset_type = 'twitter' + ``` + we mention the tables to delete rows from, note how this isn't required when deleting from one table +- we can similarly have multi table updates - + ```sql + update + actors inner join digital_assets + on + actors.id = digital_assets.actor_id + set + actors.first_name = upper(actors.first_name) + where + digital_assets.asset_type = 'facebook' + ``` +- note: a subquery cannot have select for tables being updated or deleted in the outer query +- copy a table **without the data** and just the structure - `create table copy_of_actors like actors` +- insert data from one table into another - `insert into copy_of_actors(name) select first_name from actors` + +## Views + +- views can be created by combining multiple tables +- we can filter out rows and columns +- now, a complex query becomes a simple single table query +- we can create views from other views as well, and we can perform the same joins and filtering on views that we would otherwise perform on a table +- when we do `show tables`, we see the views as well, we can see the type of table i.e. whether it is a normal table (also referred to as base table) or a view by using the command `show full tables` +- e.g. 
of creating a view - + ```sql + create view actors_twitter_accounts as + select + first_name, second_name, url + from + actors inner join digital_assets + on + actors.id = digitalassets.actor_id + where + asset_type = 'twitter' + ``` +- views are basically like stored queries, so they get updated whenever the tables get updated +- we can use `create or replace` to either create a view or replace it if one already exists. e.g. for single actors + ```sql + create or replace view single_actors as + select * from actors where marital_status = 'single'; + ``` +- we can update or delete rows from the underlying base tables using views. however, there are conditions e.g. it shouldn't have specific types of joins, group by statements or aggregation functions, etc. + ```sql + insert into single_actors + (first_name, second_name, dob, gender, marital_status, net_worth_in_millions) + values + ('charlize', 'theron', '1975-08-07', 'female', 'single', 130); + ``` +- e.g. i try inserting a row into this view, which fails the filtering clause used to create the view + ```sql + insert into single_actors + (first_name, second_name, dob, gender, marital_status, net_worth_in_millions) + values + ('tom', 'hanks', '1956-07-09', 'male', 'married', 350); + ``` +- now, since views can update their base tables, this went through and updated the table. however, since the view's query filters out married actors, we don't see the row in the view. we have essentially updated a row in a table through a view which will not be visible in the view. if this behavior is not desirable, we can use the check option while creating the view + ```sql + create or replace view single_actors + as select * from actors where marital_status = 'single' + with check option; + ``` +- now the insert statement for tom hanks will fail +- if we create views using other views, the check option can have scopes of **local** and **cascade**. local means that only the check option of the view being used for the update will be considered, while cascade looks at the check option of the views being used by this view itself as well +- we can drop views using `drop view single_actors` + +## Triggers + +- triggers are statements that get invoked when we perform an operation like insert, update or delete +- note: if we perform an operation like truncate which is equivalent to delete, triggers won't be invoked +- triggers can be **row level** or **statement level** +- row level triggers are invoked once per row, e.g. if a statement updated 25 rows then it gets invoked 25 times, while statement level triggers are invoked once per statement +- triggers can be invoked at 6 phases - (before, after) * (insert, update, delete) +- e.g. 
of trigger - + ```sql + delimiter ** + create trigger net_worth_check + before insert on actors + for each row + if new.net_worth_in_millions < 0 or new.net_worth_in_millions is null then + set new.net_worth_in_millions = 0; + end if; + ** + delimiter ; + + insert into actors (first_name, net_worth_in_millions) values ('tom', 350); + insert into actors (first_name, net_worth_in_millions) values ('young', null); + insert into actors (first_name, net_worth_in_millions) values ('old', -540); + + select * from actors; -- actors young and old will have net_worth_in_millions adjusted to 0 + ``` +- show triggers - `show triggers;` +- drop triggers - `drop trigger if exists net_worth_check;` +- we can also include multiple statements by enclosing statements after `for each row` inside a begin-end block + +## Transactions + +- we use transactions since we want either all the statements or none of them to go through +- there can be storage engines which don't support transactions / apply locking using different methods +- irrespective of whether transactions are supported, databases should have some form of locking to disallow concurrent access from modifying the data. e.g. InnoDB supports row level locking so that multiple users can modify the data in the same table. this also makes it a little slower +- we can start and commit a transaction using - + ```sql + start transaction; + -- statements + commit; + ``` +- we can roll back a transaction using + ```sql + start transaction; + -- statements + rollback; + ``` diff --git a/_posts/2023-07-23-snowflake.md b/_posts/2023-07-23-snowflake.md new file mode 100644 index 0000000..86dcd90 --- /dev/null +++ b/_posts/2023-07-23-snowflake.md @@ -0,0 +1,609 @@ +--- +title: Snowflake +--- + +## Snowflake + +- we can create and run queries inside worksheets +- we can see the snowflake_sample_data database by default with some sample data + ```sql + select * from snowflake_sample_data.tpch_sf1.customer + ``` +- we use snowflake's virtual warehouses for mpp (massive parallel processing). this allows the query to be processed in parallel in small chunks +- virtual warehouse sizes - xs (1 server), s (2 servers), m (4 servers),...4xl (128 servers) +- when not in use, a warehouse can be suspended. configure this behavior automatically using auto suspend and auto resume +- creating warehouses can be done by using the ui / even issuing sql statements in the worksheet +- in the worksheet, we can set the context i.e. 
the current database, schema and warehouse or use commands like `use warehouse` or `use database` +- multi clustering - when creating a warehouse, we can create it as a cluster and set the minimum and maximum number of warehouses allowed +- i think multi clustering is an enterprise feature, i do not see the option for it in the ui +- based on when load is high, this cluster will automatically add or remove warehouses for us (think asg in aws) +- multi clustering is better for more number of queries, for more complex queries we might need to consider vertical scaling (increase size of warehouse) +- notice difference between **more number of queries** vs **more complex queries** +- so basically there is a queue of the queries, which gets assigned one by one to the warehouses +- scaling policy - standard is the default whereas economy preserves the cost + - standard - add additional virtual warehouses if there is a task queued + - economy - add additional virtual warehouses if the estimated time for the current cluster is at least 6 minutes +- optimize virtual warehouse usage - + - have dedicated virtual warehouses for different use cases since they have different workload types + - understand horizontal scaling (more concurrent queries) vs vertical scaling (more complex queries), and choose the right one based on use case +- snowflake editions - **standard**, **enterprise** (has all features of standard with additional features like multi cluster warehouse, time travel upto 90 days as opposed to the 1 day inside standard, materialized views, etc), **business critical** (all features of enterprise with extended support etc) and **virtual private** (think dedicated hosts in aws ec2) +- we are charged for storage (after compression) and compute (warehouse) +- for storage, we have two options to choose between - **on demand** (pay for what you use) and **capacity** (pay upfront) +- 40$ per tb per month for on demand storage, 23$ per tb per month for capacity storage +- for xs virtual warehouse, we consume 1 credit per hour consumed by second i.e. if we consume for half an hour, we use half a credit (minimum is one minute). number of credits consumed by a warehouse depends on size (1 credit per server, so medium would consume 4 credits per hour) +- for virtual warehouse, we are charged in terms of credits. there is a conversion of credit and dollars associated with it. e.g. for cloud provider as aws and in the us-east-1 region - 2$ per credit for compute if using standard edition +- methods of loading data in snowflake - + - **bulk / batch loading** - uses our compute. e.g. copy command + - **continuous loading** - doesn't use our compute, serverless. e.g. snowpipe +- stages - location from where data can be loaded + - **external** - maintains url, access credentials, etc. e.g. 
s3 buckets + - **internal** - local storage maintained by snowflake +- note - there are costs considerations around data transfer when moving data from different regions or different clouds vs same cloud and same region +- creating a stage - + ```sql + create or replace database our_first_db; + + create or replace database manage_db; + create or replace schema manage_db.external_stages; + + create or replace file format manage_db.external_stages.csv_format + type = csv field_delimiter = ',' skip_header = 1; + + create or replace stage manage_db.external_stages.bucketsnowflakes3 + url = 's3://bucketsnowflakes3'; -- the bucket is unprotected + + list @manage_db.external_stages.bucketsnowflakes3; -- lists files + + create or replace table our_first_db.public.orders ( + order_id varchar(30), + amount int, + profit int, + quantity int, + category varchar(30), + subcategory varchar(30) + ); + + copy into our_first_db.public.orders + from @manage_db.external_stages.bucketsnowflakes3 + file_format = manage_db.external_stages.csv_format + files = ('OrderDetails.csv'); + ``` +- doing some transformations before loading data - + ```sql + copy into orders_ex (order_id, profit, profitable_flag) from ( + select + s3.$1, + s3.$2, + iff(cast(s3.$3 as int) > 0, 'profitable', 'non profitable') + from + @manage_db.external_stages.bucketsnowflakes3 s3 + ) + file_format = manage_db.external_stages.csv_format + files = ('OrderDetails.csv'); + ``` +- instead of `files` where we specify the full names of the files in an array like structure, we can specify a regex to match file names using the `pattern` keyword +- lets say we have a column of type integer in the create table statement, but the data in the csv inside s3 is bad and one of the rows in the csv has a string for the corresponding column. we can configure the behavior on encountering an error as follows - + ```sql + -- ... + files = ('OrderDetails_error.csv') + on_error = skip_file; + ``` + the options for `on_error` are - + - **abort_statement** - the default. abort the copying and rollback the rows copied + - **continue** - skip the row where the error happened and continue the loading of data + - **skip_file** - skip the file where the error happened but continue loading other files. we can also configure the error limit per file in this case, e.g. **skip_file_3** would mean skip the file if three or more errors happen (so skip_file actually means skip_file_1?) +- before actually copying over the data, we can also do a dry run of the copy - this way we can know beforehand if the copying will go through without actually executing it. we configure this using **validation_mode** i.e. if we provide this option, the data is not actually copied + ```sql + -- ... + files = ('OrderDetails_error.csv') + validation_mode = return_errors; + ``` + the two options are - + - **return_errors** - returns all errors if any during the execution of the entire thing. the output will contain the files where the error occurred, the row number, the reason of the error, etc + - **return_n_rows** - e.g. return_5_rows would mean perform the validation on only the first 5 rows, and if a failure occurs, throw the exception, and if no, return these 5 rows. note - the difference is this returns the processed rows while the above returns files where exceptions occurred +- if column has type `varchar(10)` but the source csv column has values of larger lengths, the copy command will fail. 
we can prevent this failure by using `truncatecolumns = true`, so that columns with greater lengths are just truncated i.e. electronics will become electronic +- by default, if we rerun the same copy command more than once, the rows will not be duplicated 🤯. we can change this behavior by providing `force = true`. note that this can lead to duplicates +- to view the history of copy commands i.e. source stage, success vs failure count, etc, use - + ```sql + select * from copy_db.information_schema.load_history; + ``` +- note that the command above was for a single database. to view the same thing across databases, use the snowflake db + ```sql + select * from snowflake.account_usage.load_history; + ``` +- for loading unstructured data (e.g. json), we might not be able to load it directly like above i.e. csv rows were easily mapping one to one with table rows +- so, we first load the json to a new table which has only one column of type `variant` +- we then transform this data (e.g. flatten) to load into our own tables + ```sql + create or replace stage manage_db.public.s3_json + url = 's3://bucketsnowflake-jsondemo'; + + create or replace file format manage_db.public.json + type = json; + + create or replace table our_first_db.public.json_demo ( + raw_json variant + ); + + copy into our_first_db.public.json_demo + from @manage_db.public.s3_json + file_format = manage_db.public.json + files = ('HR_data.json'); + ``` +- now, assume the json has the format as below - + ```json + { + "city": "Louny", + "first_name": "Dag", + "gender": "Male", + "id": 2, + "job": { + "salary": 43000, + "title": "Clinical Specialist" + }, + "last_name": "Croney", + "prev_company": [ + "MacGyver, Kessler and Corwin", + "Gerlach, Russel and Moen" + ], + "spoken_languages": [ + { "language": "Assamese", "level": "Basic" }, + { "language": "Papiamento", "level": "Expert" }, + { "language": "Telugu", "level": "Basic" } + ] + } + ``` +- we can for e.g. query city as follows - + ```sql + select raw_json:city from our_first_db.public.json_demo; + ``` +- recall raw_json was the variant column in our table. the output for e.g. of above would be a column containing cells of the format `"Bakersfield"`. so, now to convert this to a string i.e. `Bakersfield` (without quotes), we can do the below - + ```sql + select raw_json:city::string from our_first_db.public.json_demo; + ``` +- for nested object e.g. refer job in the json, this would work - + ```sql + raw_json:job.salary::int job_salary + ``` +- for nested arrays e.g. 
refer languages in the json, this would work - note how we can only grab one language at a time since this is like one to many + ```sql + raw_json:spoken_languages[0].language::string first_language + ``` +- so, the above solution works for arrays if we are fine with introducing new columns like first_language, second_language, etc +- but what if we want a table that is like if we had to perform a join between employee data and spoken languages - + ```sql + select + json_demo.raw_json:first_name::string first_name, + flattened.value:language::string language + from + our_first_db.public.json_demo json_demo, + table(flatten(raw_json:spoken_languages)) flattened; + ``` +- the output of this command would look like this - + + | first_name | language | + | ---------- | ---------- | + | Portia | Kazakh | + | Portia | Lao | + | Dag | Assamese | + | Dag | Papiamento | + | Dag | Telugu | + +- now theoretically we could have done as below - + ```sql + select + raw_json:first_name::string first_name, + raw_json:spoken_languages[0].language::string language + from + our_first_db.public.json_demo + union all + select + raw_json:first_name::string first_name, + raw_json:spoken_languages[1].language::string language + from + our_first_db.public.json_demo + union all + select + raw_json:first_name::string first_name, + raw_json:spoken_languages[2].language::string language + from + our_first_db.public.json_demo; + ``` +- _notice the index of spoken_languages above_. the downside is the output would be as follows i.e. there would be nulls inside the language row for people having less than three languages + + | first_name | language | + | ---------- | ----------- | + | Portia | Kazakh | + | Portia | null | + | Portia | null | + | Dag | Assamese | + | Dag | Papiamento | + | Dag | Telugu | + +- caching - snowflake has caching enabled by default, and it is cached for 24 hrs. to ensure this however, ensure that queries go on the **same warehouse**. this is why having dedicated virtual warehouses for dedicated groups can help +- we can confirm if the cache was used by clicking on the query id - it shows table scan + aggregation for the first time, and shows query result reuse the second time onwards +- clustering - snowflake creates cluster keys for columns to create micro partitions. this prevents full table scans +- we can explicitly do clustering + - do it for columns which are usually used in where clauses + - do it for columns frequently used in joins (similar to above) + - avoid extremes - + - not useful for columns which have too many unique values, e.g. id + - not useful for columns which have too less unique values, e.g. gender +- we can confirm clustering performance by clicking on the query id - it shows how many total partitions are there and how many partitions are used +- connecting to s3 securely using integration objects - + - create an iam role - + - select the trusted entity as the same account id in which this role is being created + - select the requires external id parameter and enter a random value here for now + - above steps result in a trust policy like below. 
note that both values entered above are placeholders for now - + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::8502136:root" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "something-random" + } + } + } + ] + } + ``` + - create an integration object inside snowflake - + ```sql + create or replace storage integration snowflake_s3_demo + type = external_stage + storage_provider = s3 + enabled = true + storage_aws_role_arn = 'arn:aws:iam::8502136:role/SnowflakeDemo' + storage_allowed_locations = ('s3://snowflake-demo-3b98x97') + ``` + - run `describe storage integration snowflake_s3_demo` and copy the values under `STORAGE_AWS_IAM_USER_ARN` and `STORAGE_AWS_EXTERNAL_ID`. replace the values in the trust policy for principal and external id with this + - now, we can use the integration object when creating a stage - + ```sql + create or replace stage manage_db.external_stages.csv_folder + url = 's3://snowflake-demo-3x7' + storage_integration = snowflake_s3_demo + file_format = manage_db.file_formats.csv_fileformat + ``` +- snowpipe - enables loading of data automatically when for e.g. a new file is added to the s3 bucket +- snowpipe is serverless i.e. our compute is not used for this +- this near realtime ability is achieved via s3 notifications sent to snowflake managed sqs queue +- setting up a snowpipe - + - create a pipe - + ```sql + create pipe snowpipe_demo.public.s3 + auto_ingest = true as + copy into snowpipe_demo.public.employee + from @snowpipe_demo.public.s3_csv + file_format = snowpipe_demo.public.csv + pattern = '.*employee.*\.csv'; + ``` + - run the describe command to grab the queue arn - `describe pipe snowpipe_demo.public.s3` + - set up event notification on the s3 bucket with this sqs arn as the destination +- to view pipes, use `show pipes` or we can specify database as well using `show pipes in database snowpipe_demo` +- to make changes to the pipe, pause it first - + ```sql + alter pipe snowpipe_demo.public.s3 set pipe_execution_paused = true; + ``` +- even if we want to make changes to data, e.g. to have existing files picked up the snowpipe, pause the snowpipe before running the copy command manually to load the data of existing files +- time travel - e.g. we make an erroneous update like this - + ```sql + update test set first_name = 'Shameek'; + ``` +- we can now go back in time to look at what the data looked like before the erroneous update - + ```sql + -- go back a specific amount of seconds + select * from test at (offset => -60 * 2); + -- OR go back to a certain timestamp + alter session set timezone = 'UTC'; + select current_timestamp; + select * from test at (timestamp => '2023-07-28 03:36:21.779'::timestamp); + -- OR before a certain query (the erroneous update in this case) was executed + select * from test before (statement => '01adeb9c-0604-af37-0000-007bd70792b5'); + ``` +- note - for the `before` statement query issued above, snowflake has a history of all queries executed which we can see in the ui +- e.g. 
of restoring -
  ```sql
  truncate table test;
  insert into test (
    select * from test before (statement => '01adebc9-0604-af9c-0000-007bd707b315')
  );
  ```
- optionally, load the time traveled data into a backup table and then load it from there into the original table, instead of loading the data into the original table directly as described above
- if we accidentally drop a table / schema / database, we can run the undrop command, e.g. `undrop table test` to restore it
  - optionally, if we accidentally run `create or replace table test...`, we can restore the test table as it was before the replace command was executed by first renaming the current wrongly instantiated table, e.g. `alter table test rename to test_aux`, and then running the undrop command to bring the pre-replace test table back into our database
- we can go back up to 90 days in editions enterprise and above, and up to 1 day in the standard edition. however, the default is 1 day, so on enterprise and above we have to increase the retention period manually if we want longer time travel -
  ```sql
  alter table test_tt set data_retention_time_in_days = 2;
  ```
- failsafe - protection of historical data in case of a disaster
- the failsafe period **starts after the time travel period ends**
- the failsafe period is 7 days
- it is not queryable / usable by end users like time travel is. the idea is to reach out to snowflake support after a disaster occurs to restore the table to a previous state
- the failsafe period cannot be configured like time travel can
- table type - table type is a property of the table. the different table types are -
  - permanent tables - this is the default. we have both time travel (0-90 days) and failsafe
  - transient tables - we have time travel (0-1 day), but no failsafe
    ```sql
    create or replace transient table -- ...
    ```
  - temporary - we have time travel (0-1 day) but no failsafe. note - this is only scoped to a session i.e. we lose this table when the session is closed / cannot view it from other sessions
    ```sql
    create or replace temporary table -- ...
    ```
- the types above are not only scoped to tables, but to databases / schemas as well
- we pay for additional storage for failsafe / time travel, so use transient tables for "reproducible" data like the staging layer of a warehouse?
- **zero copy cloning** - when we use the clone command, the new table reuses the data and metadata of the older table. this way, it is cost efficient. subsequent updates however do not affect one another
- we can clone storage objects (databases, tables, schemas) and stages, file formats, tasks, etc
- we can use time travel with cloning as well -
  ```sql
  create table cloned
  clone source
  before (timestamp => ...) 
+ ``` +- swap table / schemas - swaps the underlying metadata and data as well + ```sql + alter table swap_demo.public.development + swap with swap_demo.public.production; + ``` +- data sharing - data is not copied again, so it is automatically immediately up to date for the consumer +- snowflake users it is shared with have to use their own compute resources for this +- creating a share - + ```sql + create or replace share orders_share; + grant usage on database data_share_demo to share orders_share; + grant usage on schema data_share_demo.public to share orders_share; + grant select on table data_share_demo.public.orders to share orders_share; + ``` +- add account to share - + ```sql + alter share orders_share add account = <>; + ``` +- create a database from the share inside the consumer account - + ```sql + create database orders_db from share <>.orders_share; + ``` +- now, the consumer can start consuming the data from this newly created database +- till now, we assumed that the consumers have their own snowflake account when sharing data. non snowflake users can access shares via a reader account. however, our compute is used in this case +- create a reader account + ```sql + create managed account analytics + admin_name = analytics + admin_password = 'P4$$w0rcl' + type = reader; + ``` +- add the reader account to the share - + ```sql + show managed accounts; -- use the value of "locator" for the value below + alter share orders_share add account = QBB35692; + ``` +- in the reader account, create database from share - + ```sql + show shares; + create database orders_db from share <>.orders_share; + ``` +- create a virtual warehouse inside the reader account (looks like parent account virtual warehouses and reader account virtual warehouses are not exposed to each other?) +- for granting select on all tables in a database / schema - + ```sql + -- instead of + grant select on table data_share_demo.public.orders to share orders_share; + -- do + grant select on all tables in database data_share_demo to share orders_share; + -- or + grant select on all tables in schema data_share_demo.public to share orders_share; + ``` +- views - e.g. instead of sharing all data, we want to share some restricted data. we can do this via views. e.g. - + ```sql + create or replace view data_share_demo.public.loan_payments_cpo as ( + select loan_id, principal + from data_share_demo.public.loan_payments + where loan_status = 'COLLECTION_PAIDOFF' + ); + ``` +- however, the issue with the above is for e.g. if we grant a role select on this view, and if a user with that role runs the command `show views`, they can view things like view definition. ideally, i would not have wanted to expose the fact that loan_status maintains an enum, since it is not even present in the projection clause of the view creation statement +- creating a secure view - `create or replace secure view...` +- note - we cannot use shares with normal views, we have to use secure views +- data sampling - use a subset of dataset when for e.g. testing workflows out +- two methods of sampling in snowflake - + - row or bernoulli method - every row is chosen with a probability of percentage p. so, it maybe more random since continuous rows are not chosen + ```sql + select * from snowflake_sample_data.tpcds_sf10tcl.customer_address + sample row (1) seed (25); -- seed helps reproduce same results when using randomness + ``` + - block or system method - every block is chosen with a probability of percentage p. 
so, it maybe a bit more quicker, since it uses micro partitions + ```sql + select * from snowflake_sample_data.tpcds_sf10tcl.customer_address + sample system (1) seed (25); + ``` +- tasks - it stores an sql statement that can be scheduled to be executed at a certain time or interval + ```sql + create or replace task task_db.public.customer_insert + warehouse = compute_wh + schedule = '1 minute' + as + insert into customers (created_date) values (current_timestamp); + ``` +- notice how tasks use our compute unlike snowpipe, materialized views, etc? +- on running `show tasks`, feels like tasks are suspended by default. so, run the following - + ```sql + alter task task_db.public.customer_insert resume; + ``` +- for crons, - `schedule = 'USING CRON * * * * * UTC'` +- tree of tasks - a root task, which can then have children (multiple levels are allowed). one child task can have one parent task, but one parent task can have multiple children. when declaring a child task, instead of `schedule`, we use `after task_db.public.parent_task` +- note - i think the parent task needs to be suspended first i.e. we first suspend the parent task, create and resume the child task and then finally resume the parent task, else we get an error. even as a best practice that feels right +- getting execution history of tasks like errors, completion time, etc. it also has records for the next queued execution + ```sql + select * from table(task_db.information_schema.task_history(task_name => 'customer_insert')); + ``` +- tasks can also have a `when` clause, and the task is executed only if the condition evaluates to true, else the task is skipped +- streams - helps with cdc (change data capture) to capture the delta (changes) of the source data. so, streams help capture dml (i.e. crud) changes +- we only pay for the storage of metadata columns of the stream that helps determine whether the row was deleted, updated, etc. the rows in streams reference the original source for the actual data +- create a stream - + ```sql + create or replace stream streams_demo.public.sales_raw_stream + on table streams_demo.public.sales_raw; + ``` +- we can run select on the stream table just like we would on a normal table + ```sql + select * from streams_demo.public.sales_raw_stream; + ``` +- the stream has three additional columns - `METADATA$ACTION`, `METADATA$ISUPDATE`, `METADATA$ROW_ID` +- once we process the stream, the data in the stream is deleted. it feels like stream is like an "auto generated temporary staging layer" of the warehouse. e.g. if i insert into a table by running a select on the stream table, the stream table clears up +- an update corresponds to two rows in streams - an insert and a delete for `METADATA$ACTION`, and true for `METADATA$ISUPDATE` in both rows. so, `METADATA$ACTION` is always either insert or delete, and we need to determine if the change is due to an update using `METADATA$ISUPDATE` +- e.g. of using streams - imagine store is a static reference table. we want to process the changes in sales table to a table used for analytics, that is like a join between sales and store tables. so, we can assume that for every record in the sales table, there would be a record in this sales analytics table, with added information about the store. 
so, the stream is needed for the sales table, and not the store table, and we update the final table used for analytics by joining the sales stream table and store reference table + ```sql + create or replace stream streams_demo.public.sales_raw_stream + on table streams_demo.public.sales_raw; + + merge into streams_demo.public.sales_final sf + using ( + select sa.*, st.employees, st.location + from streams_demo.public.sales_raw_stream sa + join streams_demo.public.store_raw st + on sa.store_id = st.store_id + ) src + on src.id = sf.id + when + matched + and src.METADATA$ACTION = 'DELETE' + and not src.METADATA$ISUPDATE + then delete + when + matched + and src.METADATA$ACTION = 'INSERT' + and src.METADATA$ISUPDATE + then update set + sf.product = src.product, + sf.price = src.price, + sf.amount = src.amount, + sf.store_id = src.store_id, + sf.location = src.location, + sf.employees = src.employees + when + not matched + and src.METADATA$ACTION = 'INSERT' + and not src.METADATA$ISUPDATE + then insert values ( + src.id, + src.product, + src.price, + src.amount, + src.store_id, + src.location, + src.employees + ); + ``` +- we can use streams in the `when` clause of tasks! so, we can pretty much build an entire etl pipeline just using snowflake - + ```sql + when system$stream_has_data('stream-name') + as -- the entire sql for stream processing defined above + ``` +- stream types - standard and append-only. append-only captures only inserts while standard captures inserts, updates and deletes. default is standard as seen above +- change tracking - tables have a change tracking property. we can set it to true as follows - + ```sql + alter table names set change_tracking = true; + ``` +- now, with change tracking enabled, we can basically see the changes in a table in the same format as we saw in streams - + ```sql + select * from names + changes (information => default) + at (offset => -240); + ``` +- my understanding - **the difference is that unlike streams, this does not get deleted. its almost like we have a rolling window of cdc until the time travel / retention period** +- again - notice the use of default in the changes clause above. we can also use append_only instead +- materialized view - if we run an expensive query frequently, it can lead to bad user experience. so, we can instead use materialized views + ```sql + create or replace materialized view orders_mv as + -- ... + ``` +- so, materialized views are updated automatically when its base tables are updated. this updating is maintained by snowflake itself. when we query using materialized view, data is always current +- this means that if the materialized view has not been updated completely by the time we initiate a query, snowflake will use the up to date portions of the materialized view and fetch the remaining data from the base tables +- since background services of snowflake are being used for updating materialized views, it adds to the cost independent of our virtual warehouses +- use materialized views if data is not changing frequently and view computation is expensive. if data is changing frequently, use change tracking / streams + tasks +- [has a lot of limitations i think 😭](https://docs.snowflake.com/en/user-guide/views-materialized#limitations-on-creating-materialized-views) - joins, some aggregation functions, having clause, etc are not supported at the time of writing +- dynamic data masking - returns masked results for security purpose, e.g. 
pii (personally identifiable information) + ```sql + create or replace masking policy phone + as (val varchar) returns varchar -> + case + when current_role() in ('ACCOUNTADMIN') then val + else '#####' + end; + + alter table customers + modify column phone_number + set masking policy phone; + ``` +- some more masking policy examples - + - we just want to see the domain of the emails - + ```sql + when current_role() not in ('ACCOUNTADMIN') then regexp_replace(val, '+\@', '****@') + ``` + - we want to be able to do comparisons, e.g. we want to join by name, but we do not want to allow seeing of the names. we can use `sha2(val)`, so that while users see an encrypted value, it is a consistent hash, so running it on the same value will produce the same result + +### Access Management + +- rbac (role based access control) i.e. privileges are assigned to roles, which are inturn assigned to users +- in snowflake we have dac (discretionary access control) i.e. every object has an owner, who can grant access to that resource. so, all objects have an owner, which is a role, and this role has all privileges on that object by default. the objects on which we can grant privileges are also called securable objects, e.g. warehouses, databases, tables, etc +- role hierarchy - the parent role will automatically have the privileges of all of its child roles +- my understanding + - a user can have multiple roles + - the public role is assigned to all new users by default + - the default role is the one that determines what role to use when for e.g. a new worksheet is opened by the user, or maybe like when no role is specified + - for all roles to be used, set secondary role to all. e.g. we have a system account, which has warehouse access via a different role, and access to tables via yet another role. we cannot specify both roles in for e.g. the jdbc url. so, we can instead set the secondary role to all for permissions from all roles to kick in for a user anytime the user makes a query +- system defined roles - + - account admin - + - the top level role + - can manage things like reader accounts + - avoid using this, and users using this should use mfa + - do not create objects using this, as otherwise we would have to manually add privileges to users that need it (it is at the top of hierarchy so no role inherits "from" it) + - only account admin can view things like usage / billing information + - security admin - + - can manage any object grant globally - my doubt - does this mean it can do this for objects that it (or its nested children) do not own as well? + - can be used to create and manage roles but thats usually done by useradmin? + - example - (note the hierarchy i.e. sales_user is a child of sales_admin, which is inturn a child of sysadmin. this is a best practice) + ```sql + create or replace role sales_admin; + create or replace role sales_user; + + grant role sales_user to role sales_admin; + grant role sales_admin to role sysadmin; + + create or replace user simon_sales_user + password = 'p@$$worcl' + default_role = sales_user; + grant role sales_user to user simon_sales_user; + + create or replace user olivia_sales_admin + password = 'p@$$worcl' + default_role = sales_admin; + grant role sales_admin to user olivia_sales_admin; + ``` + - sysadmin - + - create warehouses, databases, etc + - custom roles should be attached to sysadmin as a best practice. this way, the objects created by these custom roles can be managed by sysadmin. 
otherwise, this would not be possible + - example - we run the below from inside sysadmin. despite us granting ownership to sales_admin, sysadmin can still perform all the operations on these objects since sysadmin inherits permissions from sales_admin. refer above, this setup was basically done by security admin + ```sql + create or replace database sales_db; + grant ownership on database sales_db to role sales_admin; + grant ownership on schema sales_db.public to role sales_admin; + ``` + - now, from inside sales_admin, we can run the below - + ```sql + grant usage on database sales_db to role sales_user; + grant usage on schema sales_db.public to role sales_user; + grant select on table sales_db.public.customers to role sales_user; + ``` + - useradmin - + - used to create / manage users and roles + - unlike securityadmin, it does not have ability to grant privileges to all objects, only on objects that it owns + - public role + - every user is granted this role by default + +![role hierarchy](/assets/img/warehouse-and-snowflake/role-hierarchy.png) diff --git a/_posts/2023-08-12-messaging-systems.md b/_posts/2023-08-12-messaging-systems.md new file mode 100644 index 0000000..ffa3f37 --- /dev/null +++ b/_posts/2023-08-12-messaging-systems.md @@ -0,0 +1,259 @@ +--- +title: Messaging Systems +--- + +## Kafka + +### Setup + +- note - environment should have java 8+ installed +- download the zip from [here](https://www.apache.org/dyn/closer.cgi?path=/kafka/3.5.0/kafka_2.13-3.5.0.tgz) +- unzip it - `tar -xzf kafka_2.13-3.5.0.tgz` +- note - the 2.13... here is not the kafka, but the scala version? + +### Staring using Zookeeper + +- in one terminal, start zookeeper - `zookeeper-server-start.sh ~/kafka_2.13-3.5.0/config/zookeeper.properties` +- in another terminal, start kafka - `kafka-server-start.sh ~/kafka_2.13-3.5.0/config/server.properties` + +### Starting using Kraft + +- generate a cluster uuid - `KAFKA_CLUSTER_ID="$(~/kafka_2.13-3.5.0/bin/kafka-storage.sh random-uuid)"` +- format log directories - `kafka-storage.sh format -t $KAFKA_CLUSTER_ID -c ~/kafka_2.13-3.5.0/config/kraft/server.properties` +- start kafka - `kafka-server-start.sh ~/kafka_2.13-3.5.0/config/kraft/server.properties` + +### Concepts + +- helps with system integrations. sources produce data into kafka, and targets consume from kafka +- distributed, resilient, fault tolerant +- created by linkedin, now maintained by ibm, cloudera, confluent, etc +- works with spark, flink, hadoop, etc +- a sequence of messages is called a data stream +- kafka topic - a particular stream of data +- a topic is identified by topic name +- topics support any kind of message format like json, avro, binary, etc +- we can produce data using kafka producers, and consume data using kafka consumers +- topics are split into partitions +- **messages within a partition are ordered** +- **messages in a partition get an id called offset**. note - so offsets are specific to a partition +- so, order is only guaranteed inside one partition +- **offsets are not reused in a partition even if previous messages are deleted from it** +- immutability - once data is written into a partition, it cannot be updated / deleted, we can just append (add) data to it +- my understanding - we basically interact with kafka producers and consumers in our code, and they internally do things like batching, where we provide network configuration, security parameters, etc +- producers can optionally send a key along with the message. 
this key can be a string, number, binary, etc +- if this key is null, then the message can end up in any partition +- if this key is not null, this key is hashed to produce the partition number. this partition number then determines the partition the message should go to. use case - e.g. we have a delivery service, where our trucks send its coordinates every 5 seconds. we should ensure that a truck sends its coordinates to the same partition to ensure ordering, therefore the truck can use its id as the kafka message key. messages with the same key end up in the same partition +- internally kafka partitioner determines the partition using murmur2 algorithm +- parts of a message - key, body, compression (e.g. gzip, snappy, etc or even none), headers (key value pairs) and a timestamp (can be set by the system or by the user) +- kafka message serializer - help in serializing our messages which are objects into bytes. e.g. if our key is an integer and our value is a string, kafka will use its inbuilt integer and string serializer respectively for this +- consumers - pull model i.e. consumers request for data from the brokers, and not the broker pushing data into the consumers +- consumers can deserialize using deserializers similar to serializers +- best practice - do not change serializer in the producer, since that will break the deserializers in the consumers. so, create a new topic instead and have the consumers to start pulling from this new topic +- **consumers in kafka read as a consumer group** +- **consumers in a group read from exclusive partitions** i.e. multiple consumers of the same group cannot read from the same partition +- so, if we have more consumers in a consumer group than the number of partitions, (number of consumers - number of partitions) consumers remain idle +- however, a consumer in a consumer group can read from multiple partitions (e.g. when number of partitions > number of consumers) +- of course consumers from different consumer groups can read from the same partition +- if suppose a consumer from a consumer group is removed, the partitions that consumer was responsible for is automatically distributed among the other members of that consumer group +- a consumer group id is used to help identify the consumers part of the same group +- consumer offset - **consumers store the offsets they have read up till in a topic called __consumer_offsets periodically**. this way, if they die and come back up, they can continue reading from the same position in the partition where they left off +- a kafka cluster has multiple kafka brokers. each broker is identified by an id +- **each broker only contains some partitions of a topic** - so data is distributed. understand the implication of this - **this way, our topic is not limited to scale by the capability of only one worker node** in our kafka cluster +- broker discovery mechanism - consumers do not need to connect to all brokers in advance. they only need to connect to one broker, and by that they are automatically able to connect to all brokers since on initiating connection with one broker, all metadata related to the other brokers, partitions, etc is sent +- topic replication factor - if a broker is down, another broker is still available to produce data to and receive data from. **replication factor = how many copies i.e. 
how many brokers will have the same partition's copy** +- in sync replicas (isr) - all replica brokers that have caught up with the broker +- since there are multiple partitions, there is a leader among these partitions, and producers can only send data to this leader +- consumers by default only consume from the leader. so i think the replication factor only helps with disaster recovery in this case +- however, in newer versions, kafka consumers can read from replica brokers as well, if the replica broker is closer to them (e.g. we should have the consumer read from the isr in same az and not the leader / another isr in a different az to help reduce network latency and costs). this feature is called rack awareness, and for this to work, `rack.id` on the broker should have the same value as `client.rack` on the consumer +- producer acknowledgements - + - acks = 0 means producer will not wait for acknowledgement + - acks = 1 means producer will wait for acknowledgements from leader. data can be lost if leader goes down unexpectedly before replication goes through to other brokers. it was the default earlier + - acks = all (or -1) means producer will wait for acknowledgement from all replicas along with the master as well. default kafka 3.x onwards + - this option goes hand in hand with the `min.insync.replicas` option, which states how many replicas should acknowledge the data. if its value is 1, it means that only the leader has to acknowledge the data + - so, one ideal configuration to start with would be setting min isr to 2, acknowledgement mode to -1 and setting replication factor to be 3. this way, at least one replica and the leader have the write before the producer can consider the message successfully written into kafka +- topic durability - if replication factor is m, and say we want isr to be n, then we can tolerate m - n brokers going down. so, for e.g. don't over optimize i.e. if min in sync replicas are 3, (acknowledgement mode is all) and replication factor is 3, that means we cannot withstand any broker going down, which might be too much +- retries - note - this is producer retries not consumer, don't confuse with concepts like dlq here 😅. retries here refer to transient failures like kafka saves the message but acks fail, required number of brokers (min insync replicas) are unavailable at the time so kafka cannot save the message, etc. focussing on the newer versions here - + - retries (`retries`) are set to infinite (2147483647) by default. so, after the producer sends the message and if there is a failure for some of the transient reasons discussed above, the producer would again retry sending the message + - idempotence (`enable.idempotence`) is set to true by default. imagine that kafka was able to save the message i.e. write it to the replication factor number of partitions, but the ack failed. so, the producer thinks that some stuff have failed and will retry sending. so, since this property is set to true, kafka would know not to re add this message to the partitions, and would just try sending the ack again. this helps with exactly once semantics (and not duplicating thus resulting in at least once). now, from what i understood, it also helps with ordering. so, if for example the producer sends the first batch and kafka fails to commit it, when the second batch is received by kafka, kafka would throw an out of order exception to the producer. with this property, its almost like a sequence number is sent with each batch. 
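+- a hedged sketch of what these safety-related settings look like as producer properties - the bootstrap server is a placeholder, and the values shown are just the defaults / suggestions discussed above, not a prescription
+  ```java
+  Properties props = new Properties();
+  props.put("bootstrap.servers", "localhost:9092");
+  props.put("acks", "all");                                  // wait for the leader plus in-sync replicas
+  props.put("enable.idempotence", "true");                   // broker de-duplicates producer retries
+  props.put("retries", "2147483647");                        // keep retrying transient failures...
+  props.put("delivery.timeout.ms", "120000");                // ...but only within this overall bound
+  props.put("max.in.flight.requests.per.connection", "5");   // safe when idempotence is enabled
+  // note - min.insync.replicas is a topic / broker level setting, not a producer property
+  ```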
this way, both ordering and exactly once semantics are ensured + - max in flight requests (`max.in.flight.requests.per.connection`) is set to 5 by default. **this is basically how many concurrent requests producer will send without receiving the acknowledgements for them**. after this number, if our application calls send on the producer, it will start blocking. this needed to be 1 in older versions to maintain ordering, but with idempotence now, it is enough to keep this <= 5 based on what we discussed above and [this](https://docs.confluent.io/platform/current/installation/configuration/producer-configs.html#max-in-flight-requests-per-connection) + - delivery timeout (`delivery.timeout.ms`) is set to 120000 i.e. 2 minutes by default. now retries is infinite does not mean producer would just keep retrying endlessly in case of failure, since the time it first sent the message, it would keep retrying until this timeout occurs. again remember that this retrying decision is being done by the producer which we write, so we can configure it in the properties +- zookeeper - helps with managing multiple brokers. so, helps with issues like leader election, sending notifications to other brokers if a brokers goes down, etc +- kafka up to 2.x cannot work without zookeeper. however, kafka from 3.x can work without zookeeper using kraft, and kafka 4.x onwards will not use zookeeper at all +- zookeeper itself too runs in master slave mode, runs odd number of servers underneath +- because of this change of migrating away from zookeeper, we should not mention zookeeper configuration inside our connections, but only mention broker endpoints. this change can even be seen in the kafka cli etc, e.g. when running kafka-topics.sh, we do not specify the zookeeper endpoint. this way when we change from 3.x to 4.x, there would be slim to no change required from us +- understand how the offsets are associated to a consumer group on a per partition basis +- as we add / remove more consumers to a group, the existing consumers are notified of this and they accordingly adjust the partitions that they listen to +- when a new partition is added to a topic, this new partition also needs to be assigned to one of the consumers of a group subscribed to the topic +- partition rebalance - moving of partitions between consumers - can happen due to adding new partitions to the topic / adding or removing consumers in a group +- there are different strategies to partition rebalance (`partition.assignment.strategy`) - + - **eager rebalance** - all consumers give up their ownership i.e. the partition they were responsible for. then a fresh calculation is made and the consumers are randomly assigned the partitions again. issue - it might happen that an existing consumer now starts listening to a new partition. also, for albeit a brief period when the rebalancing is happening, there would be no consumers at all, this phenomenon where there are no consumers at all during a brief period is called stop the world event + - **cooperative rebalance / incremental rebalance** - process is uninterrupted for unaffected partitions, e.g. imagine consumer 1 was subscribed to partition 1, and consumer 2 was subscribed to partitions 2 and 3. if a new consumer is added, only for e.g. partition 3 would be reassigned to this new consumer, but data from partitions 1 and 2 continues flowing uninterrupted +- **static group membership** - by default, when a consumer leaves a group, the partition they owned is reassigned. 
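+- a small, hedged consumer configuration sketch tying the last two points together (the group id, instance id and topic name are made-up placeholders) - it opts into cooperative rebalancing, and also sets the static membership property that the next point explains
+  ```java
+  // classes are from org.apache.kafka.clients.consumer / org.apache.kafka.common.serialization
+  Properties props = new Properties();
+  props.put("bootstrap.servers", "localhost:9092");
+  props.put("group.id", "truck-position-consumers");
+  props.put("key.deserializer", StringDeserializer.class.getName());
+  props.put("value.deserializer", StringDeserializer.class.getName());
+  props.put("partition.assignment.strategy", CooperativeStickyAssignor.class.getName()); // cooperative / incremental rebalance
+  props.put("group.instance.id", "consumer-1");                                          // static group membership
+
+  KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
+  consumer.subscribe(List.of("truck_positions"));
+  ```
+- on static membership -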
we can specify a `group.instance.id` which makes the consumer a static member. this way there is no rebalance until `session.timeout.ms` (heartbeat mechanism discussed later), so the consumer has this much time to be able to come back up, otherwise the partition would be rebalanced. use case - consumers for e.g. maintain a cache and this way, a rebuilding of that cache is not required by the new consumer. feels like without this property, the partition would be reassigned to another consumer and not wait for the session timeout? +- quick question - how to implement a fan out pattern in kafka - do not assign the consumer group id / specify a different value for the consumer group id for each of your horizontally scaled instances - this way all the instances will receive the message +- producer compresses the batch of messages before sending it to the broker +- this helps with things like better utilization of disk on kafka, better throughput, etc +- compression can be specified at producer / topic level +- compression can be specified at producer level or the broker level as well using `compression.type` - + - producer - the default. use the compressed batch from the producer as is and write directly without recompression + - none - all batches are decompressed by the broker + - specify a type like lz4 explicitly. if the compression format is the same as done by the producer then store as is, else decompress and recompress using the specified format +- so, the summary of above according to my understanding is, leave compression type at broker level to be producer (it is the default), and set the compression type to be snappy or something at the producer config (default is none) +- batching settings - increasing batch sizes improves throughput, means lesser network calls, compression becomes more effective, etc. but of course it introduces latency for downstream consumers + - `linger.ms` - how long the producer should wait before sending the message to kafka. default is 0 + - `batch.size` - if the batch fills to this value before `linger.ms` is over, send the batch. default is 16 kb +- `partitioner.class` - in earlier versions of kafka, if we specify no key for our message, the messages are sent to partitions in round robin fashion using **round robin partitioner**. disadvantage - for e.g. remember batching happens at partition level, so this means we cannot utilize batching effectively, since there is a batch being created for every partition. **sticky partitioner** is the default in newer versions of kafka. this means that instead of round robbin, producer would fill one batch (until `linger.ms` or `batch.size`) and then send to one partition. after this, a new batch is started. so we can leave this property untouched in newer versions +- delivery semantics - this is for consumers + - at least once - default and usually preferred. commit offset after processing of message is over. if processing of message fails or imagine consumer crashes after receiving messages, message will be read again and reprocessed since the offset was not committed. so, the processing logic must be idempotent + - at most once - commit offset as soon as message is received. if processing of message fails or imagine that after receiving messages, the consumer crashes, messages will be lost and not read again. this case ensures a message would not be processed multiple times + - exactly once - this would only be possible if both source and sink is kafka. we use the transactional api in this case. e.g. 
when using kafka streams for transformations, we can use this +- to make our processing idempotent with at least once semantics, for a given message, we should add an id, e.g. imagine how we know for an object if it needs to be updated or created in the database based on its id property. otherwise, we can use kafka coordinates - every message will have a unique (topic + partition + offset) combination, so for e.g. we could generate an id like this - `__` (understand why a separator like _ is needed - otherwise there is no way to differentiate between partition 2 offset 22 and partition 22 offset 2) +- offsets are committed after at least `auto.commit.interval.ms` time has passed since us calling poll(). the default value of this is 5 seconds. my understanding - e.g. we poll every 7 seconds, and auto commit interval is 5 seconds. when the second poll is called, the first poll would be committed. however, if we poll every 5 seconds, and auto commit interval is 7 seconds, **the first poll would be committed when the third poll is called** +- for staying inside at least once semantics, because of what was described above, our processing should be synchronous - before we call poll the next time, our current batch should have been successfully processed, so that if by chance the next poll has to commit, it can be sure that we have already successfully processed our current batch. in auto commit, commitAsync is called +- we can disable auto committing as well, and instead manually commit offsets using `consumer.commitSync()` / `consumer.commitAsync()` +- the auto offset reset (`auto.offset.reset`) property defines how to consume from a topic if there is no initial offset i.e. a new consumer group has just started listening - the default is latest i.e. start consuming from the end of the partition. we can set it to earliest. my understanding - earliest corresponds to the `--from-beginning` flag in the cli for kafka console consumer +- we can also reset consumer offsets. internally, feels like this might be possible since it is as simple as adding a message to the __consumer_offsets topic, due to the cleanup policy being compact? (discussed later) +- consumers send a heartbeat every `heartbeat.interval.ms` seconds (3 seconds by default), and if no heartbeats are received for `session.timeout.ms` seconds (45 seconds by default), the consumer is considered dead. this heartbeat related functionality is carried out by the heartbeat thread +- if a new poll call is not made in `max.poll.interval.ms`, the consumer is considered to have failed processing of that message. my understanding - this is important because all offset commits are done by newer poll calls for the previous polls? so maybe this way, kafka can know that for some reason, message processing has been stuck or has failed, and it has to re send the message for processing? +- for replicating data across kafka clusters, e.g. if cluster is across regions, or for e.g. when we are hitting performance limits with one kafka cluster and need multiple kafka clusters, etc, we can use tools like mirror maker 2. replication can be active active (two way replication, e.g. data producers in multiple regions) or active passive (one way, e.g. for global resiliency) +- when we try to connect to kafka, kafka brokers have a setting called `advertise.listeners`. 
this way, when the client tries connecting to the kafka broker, the broker returns this value and the client instead tries connecting using this value if the value it initially tried connecting using was different. e.g. imagine client tries connecting using a public ip, but the value returned by the broker using `advertise.listeners` is the private ip address +- partition count - if we change the partition count suddenly, understand it would affect ordering of messages with same keys etc +- more partitions = more parallelism +- partitions should be usually 3 times the number of brokers, so 3 partitions per broker +- replication factor - if we change this, we increase load on our kafka custer, since there is more network calls etc involved for the replicas +- replication factor should be usually 3 +- [topic naming guide](https://cnr.sh/essays/how-paint-bike-shed-kafka-topic-naming-conventions) - `..`. for message type, all possible values are mentioned in the link, some common ones are `queuing` for classic use cases, `etl` for cdc, etc. dataset name is like database name and data name is like table name. also use snake case +- [debezium](https://github.com/debezium/debezium) uses kafka connectors and kafka ecosystem underneath, and helps do realtime cdc by using database's transaction logs +- so, two common patterns with kafka - + - use applications like spark, flink, (or even kafka itself) etc to read from kafka and generate realtime analytics + - use kafka connect to write to s3, hdfs, etc from kafka and generate batch analytics from this +- kafka metrics - monitor a lot of things like how many under replicated partitions exist i.e. how many partitions have issues with in sync replicas +- we can enable in flight encryption ssl, authentication and authorization +- kafka has data retention for 7 days by default +- but until then, everything is internally in file formats, e.g. i tried poking around in the log.dir folder on my local i.e. inside /tmp/kraft-combined-logs/ +- partitions are internally made up of segments +- so, there is one (the latest) active segment, and other segments can be consider obsolete +- a segment is closed means it is available for log cleanup - this helps delete obsolete data from the disk of kafka +- how to cleanup logs - there are two possible values for `cleanup.policy` on a topic - `compact` (default for __consumer_offsets) and `delete` (default for all user defined topics) +- a segment is closed and a new one is started when either the `log.segment.bytes` size is reached, or if `log.retention.hours` is reached +- if we set cleanup policy to be compact - a new segment is created, and only the values for the latest keys for a topic is retained, and others are discarded. so e.g. segment 1 has value a for key x and value b for key y, and segment 2 has value c for key y, the newly created segment would have value a for key x and value c for key y. 
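+- a hedged sketch of creating such a compacted topic with the java admin client - the topic name, partition / replica counts and the segment setting are made-up placeholders for illustration
+  ```java
+  // AdminClient / NewTopic are from org.apache.kafka.clients.admin
+  Properties props = new Properties();
+  props.put("bootstrap.servers", "localhost:9092");
+
+  try (AdminClient admin = AdminClient.create(props)) {
+      NewTopic topic = new NewTopic("latest_truck_position", 3, (short) 3)   // 3 partitions, replication factor 3
+              .configs(Map.of(
+                      "cleanup.policy", "compact",   // keep only the latest value per key
+                      "segment.ms", "600000"));      // roll segments sooner so older ones become eligible for cleanup
+      admin.createTopics(List.of(topic)).all().get();
+  }
+  ```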
this behavior also makes sense for the consumer offsets topic if i think about it
+- for very large messages, either tweak configuration parameters to increase maximum limits, or better, use something like the sqs extended client of aws if possible
+
+## RabbitMQ
+
+- messaging systems -
+  - used for application to application communication
+  - they are near realtime - messages can be processed by consumers instantly
+  - helps establish a standard - both producers and consumers only have to obey the messaging system's specification, instead of each source having integration logic for each target
+- rabbitmq features -
+  - rabbitmq is open source
+  - multiple instances can be deployed into a cluster for high availability
+  - web interface for management and monitoring
+  - built in user access control
+  - built in rest apis (mostly for diagnostic purposes but can be used for messaging, not recommended)
+- running rabbitmq -
+  ```
+  docker container run -d -p 5672:5672 -p 15672:15672 rabbitmq:3.13.1-management
+  ```
+- publisher / producer - sends message on the exchange
+- subscriber / consumer - consumes message from the queues
+- queue - the buffer that stores messages before the consumers consume from this queue
+- exchange - routes messages to the right queue
+- routing key - the exchange uses this parameter of the messages to decide how to route it to the queues
+- binding - link between exchanges and queues
+- message durability - guarantees that messages survive server restarts and failures
+- by default, everything is "transient" i.e. lost on rabbitmq server restarts!
+- to ensure message durability, we need to set two parameters -
+  - mark queues as durable - we need to set this when creating queues
+  - use persistent delivery mode when publishing messages. spring does this by default for us
+- rabbitmq also has two types of queues -
+  - "classic" - the default. has good performance, but cannot withstand node failure, since it is only present on the primary node
+  - "quorum" - replicated across different servers. maintains consistency using quorum
+- rabbitmq can store messages either in memory or on disk
+- the "default exchange" is used if we do not specify the exchange and just specify the routing key
+  ```java
+  rabbitTemplate.convertAndSend("example.rabbitmq", "hello world");
+  ```
+- some consumers -
+  ```java
+  @Component
+  @Slf4j
+  public class Consumer {
+
+      @RabbitListener(queues = "example.rabbitmq")
+      public void consume(String message) {
+          log.info("consumed: [{}]", message);
+      }
+  }
+  ```
+- assume our producer is faster than the consumer. using the below, 3 consumer threads are created for this listener. this way, our slow consumers can keep up with the fast producer, without us having to spin up additional instances of the consumer
+  ```java
+  @RabbitListener(queues = "example.rabbitmq", concurrency = "3")
+  ```
+- spring rabbitmq uses jackson for serialization / deserialization of pojos
+- a naming convention example - x.name for exchanges, q.name.sub_name for queues
+- "fan out exchange" - implements the publish subscribe pattern - it broadcasts the message to all queues bound to it
+  - e.g. we have a fan out exchange x.hr
+  - it has bindings for two queues - q.hr.marketing and q.hr.accounting
+  - when binding, the binding key can be empty
+  - similarly, when producing, the routing key can be empty
+  - now, any messages put on the exchange x.hr will flow to both the queues
+  - in the snippet below, we specify the exchange name.
the routing key is ignored, hence it is set to an empty string + ```java + rabbitTemplate.convertAndSend("x.hr", "", employee); + ``` +- "direct exchange" - send messages to selective queues instead of broadcasting to all queues + - e.g. we have a direct exchange x.picture + - we have two queues - q.picture.image and q.picture.vector + - q.picture.image is bound using two binding keys to the exchange - png and jpg + - q.picture.vector is bound using one binding key to the exchange - svg + - now, when our routing key is png / jpg, it goes to the image queue + - when our routing key is svg, it goes to the vector queue + - so, exchange sends the message to queues where routing key = binding key + - note - if the routing key does not match any rule, the message would be discarded +- "topic exchange" - + - with direct exchange, we can only route messages using a single criteria - e.g. we only used image type above + - using topic exchange, we can route messages based on multiple criteria + - note about wildcards - + - `*` can substitute for 1 word + - `#` can substitute for 0 or more words + - e.g. we have a topic exchange x.picture + - we can send images to different queues based on image size, image type, source of image, etc + - the producer will just produce the messages using routing keys like source.size.type, e.g. mobile.large.png, desktop.small.svg and so on + - e.g. we have queues for different purposes. e.g. we want an image queue like earlier. we can have binding keys of the form either `#.png` and `#.jpg` or `*.*.png` and `*.*.jpg` + - this is true decoupling - the producer just tells the kind of messages being published, while the consumer selectively decides the messages it wants to receive based on the binding key + - similarly, if we need a consumer to consume messages for all large svg, we would use `*.large.svg` +- dead letter exchanges - + - in case of an error during consumption, spring will by default requeue the message + - we could be stuck in an infinite loop during this consumption and requeueing + - thus, we can use a "dead letter exchange" - the message after failure is forwarded to this dead letter exchange, which in turn forwards it to another queue depending on how we set the binding for this dead letter exchange + - then, from this special queue bound to the dead letter exchange, we can notify the consumers of the error + - configuring the dead letter exchange for a queue - just click on "Dead letter exchange ?" and enter the exchange name beside "x-dead-letter-exchange" + ![dead letter exchange](/assets/img/messaging-systems/dead-letter-exchange.png) + - note - we can change the routing key of the queue when moving a message to the dead letter exchange + - note - we cannot throw any exception for this too work - we need to throw `AmqpRejectAndDontRequeueException` +- time to live - + - if a message is present in a queue for longer than this timeout, it is declared "dead" + - the message from the actual queue would be moved into the dead letter exchange if configured after this timeout + - along with configuring dead letter exchange like we saw above, we can configure the queue with this ttl as well. 
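+- a hedged sketch of declaring such a queue in code using spring amqp's `QueueBuilder`, instead of clicking through the management ui - the exchange / queue names, routing key and ttl value here are made-up placeholders
+  ```java
+  @Bean
+  public Queue workQueue() {
+      return QueueBuilder.durable("q.work")
+              .deadLetterExchange("x.work.dlx")   // where rejected / expired messages get forwarded
+              .deadLetterRoutingKey("dead")       // optional - override the routing key on the way out
+              .ttl(30000)                         // messages sitting in the queue longer than 30s are declared dead
+              .build();
+  }
+  ```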
it will then automatically move the messages to the dead letter exchange in both scenarios - timeouts and errors
+- retry mechanism -
+  - some errors can be intermittent
+  - so, we might want to retry after x seconds for n times, before moving a message to dlq
+  - say we have three exchanges and three corresponding queues - work, wait and dead
+  - wait exchange is the dead letter exchange for work queue - when there is a failure in our consumer, the message is sent to wait exchange for "backoff" like functionality
+  - work exchange is the dead letter exchange for wait queue - when the message has sat in the wait queue for some time, it is moved to work exchange for retrying
+  - finally, if our consumer notices that it has already tried reprocessing the message 3 times or so, it would move the message into the dead exchange which then goes into the dead queue
+  - we can get metadata around retries etc from rabbitmq headers
+- retry mechanism in spring -
+  - on the above approach, there is a lot of manual code and configuration from our end
+  - using spring, we do not need all this logic - spring can automatically handle the retry and backoff for us, and it will move the failed messages to the dead letter exchange
+  - we only need to ensure our queue has the right dead letter exchange configured on it
+  - apart from that, we can configure the retry logic (exponential backoff) like so -
+    ```
+    spring.rabbitmq.listener.simple.retry.enabled=true
+    spring.rabbitmq.listener.simple.retry.initial-interval=3s
+    spring.rabbitmq.listener.simple.retry.max-interval=10s
+    spring.rabbitmq.listener.simple.retry.max-attempts=5
+    spring.rabbitmq.listener.simple.retry.multiplier=2
+    ```
+  - retry at 3s, then 6s (refer multiplier), and remaining 2 retries at 10s gaps
+- diff --git a/_posts/2023-08-19-hadoop.md b/_posts/2023-08-19-hadoop.md new file mode 100644 index 0000000..c685d9c --- /dev/null +++ b/_posts/2023-08-19-hadoop.md @@ -0,0 +1,1020 @@
+---
+title: Hadoop
+---
+
+## Introduction
+
+- big data is for terabytes or petabytes of data
+- explosion of data - rate at which data is being generated is very high
+- 3 vs of big data -
+  - data volume - as the resolution of cameras has increased, so has the size of the media they generate
+  - data velocity - speed at which data is generated. earlier, periodic batch jobs were more common. the shift is towards near realtime / realtime now
+  - data variety - data used to just be in the form of tables, where rdbms systems worked great. now, we have unstructured data in the form of media etc as well i.e. variety of data has increased
+  - **structured data** - row column format in a table. e.g. rdbms
+  - **semi structured data** - well defined structure, but not necessarily structured in a tabular format, e.g. json, xml
+  - **unstructured data** - e.g. text files, audio, etc
+  - some new vs - veracity (trustworthiness of data, e.g. user input might not be as trustworthy?), value (should be able to drive business value)
+- vertical scaling vs horizontal scaling -
+  - horizontal scaling is more scalable
+  - horizontal scaling is more available / fault tolerant
+  - horizontal scaling is more cost effective
+- shared nothing - each processing unit has its own storage. relatively faster
+- shared disk - each processing unit works on the same underlying storage.
time taken for data movement is high, since unlike in shared nothing where the storage can be local / closely located to the processing, it has to be located far away +- partitioning - file is broken down (partitioned) and stored in smaller parts in different nodes. also called distributed +- replication - the parts are stored in different nodes so that a node failure does not stop our processing. also called redundancy. number of total copies is determined by "replication factor" +- **4 points** - hadoop allows for horizontal scaling, follows shared nothing architecture, has partitioning, has replication +- seek time - time required for head to point to the right data on hard disk +- transfer rate - time required to move data from head of hard disk to ram +- hadoop is and isn't good at - **4 points** - + - processing large files - it is not for small files + - processing sequentially - it is not good for random access since there is no indexing like in rdbms + - handling unstructured data - it not for acid like / 3nf etc properties like in rdbms + - processing frequently changing data + +## Evolution + +- why do we need hadoop at all - e.g. when using rdbms, we can create indexes on our frequently used columns like name. however, when for e.g. google has to search its database by our search term, there is no such way easy of indexing. so, it would process our query on its data that is distributed in parallel +- so around 2004, google published a paper on gfs (google file system) and google map reduce +- in parallel around the same time, doug cutting was working on nutch +- yahoo hired doug and hadoop was created from nutch +- hadoop's hdfs = google's gfs and hadoop's map reduce = google's map reduce +- facebook launched hive, big query is google's equivalent of this +- with hive (and pig), we write sql to query or add data to hdfs, thus making writing complex operations much easier. this translates to map reduce underneath +- hbase - nosql database system on top of hdfs to store unstructured data. big table is google's equivalent of this. we store data in a denormalized format for better performance +- sqoop - data transfer (to and from) between database (mostly rdbms) and hdfs +- flume - streaming logs from distributed systems into hdfs +- [spark](/posts/spark) - complete package +- cloudera, hortonworks, etc. bundle different tools like hadoop together and distribute them + +## Hadoop Components + +- yarn - yet another resource negotiator. it is a "cluster manager". it is needed because recall how hadoop makes use of horizontal scaling, while abstracting away all the complexities underneath away from us. refer [hadoop 2.x architecture](#hadoop-2x) below for how "resource manager", "node manager" and "application master" work +- hdfs - stores large amounts of data in small chunks to allow processing them in parallel. refer [hdfs architecture](#hdfs-architecture) below for how "name node", "data node" etc work +- map reduce framework - we write simple map reduce programs discussed in this post. this is automatically run in a distributed fashion with the help of yarn, on distributed data with the help of hdfs. 
note - writing map reduce directly is not common, so tools like hive etc came into picture + +## Theory + +- my understanding - **hadoop = map reduce + hdfs + yarn** in todays world +- hadoop operating modes + - standalone - doesn't use hdfs and reads and writes directly to hard disk + - pseudo distributed - only one machine that can run both master and slave, uses hdfs + - distributed - minimum 4 nodes are needed, for production workloads +- map is run on all slave nodes, reduce is run to aggregate the results from all these slave nodes +- each machine is said to hold a split of the data +- the mapper function would be called once per split - so mappers of different splits would run in parallel +- for hadoop to work, each row of data should be processable independently and out of order +- the mapper outputs a key value pair +- while map is called for all rows, reduce is called once for each key, which is why the input of reduce contains an iterable +- one confusion i had cleared? - don't think of this map and reduce like in arrays (or even spark?). we are using `context.write`, so output of both map and reduce can contain as many elements as we want, just that map would be called once per data element, while reduce once per key along with all the values for that key. the data structure which allows multiple items for the same key is called multi bag +- so, in between the map and reduce, there is a shuffle that happens bts to help group results of map by key +- since a reduce can only run on one slave node at a time, all values for a key need to be first brought into one slave node during shuffle +- understand that output type of key / value of map = input type of key / value of reduce +- all the keys that go to a reducer are sorted by default +- number of mappers = number of splits of data. we cannot configure the number of mappers +- number of reducers by default is 1. in this case, outputs of all mappers are collected, sorted by key and then sent grouped by key to send one by one on a key wise basis to the reducer +- internally, after map process, each key is assigned to a partition +- number of partitions = number of reducers +- so, basically after map, the assigning of a partition to a key helps determine which reducer a key should go to +- the partition that an item should go to is determined based on its key - something like (consistent_hash(key) % number of partitions). so, items with the same key cannot go to different reducers +- while doing this, we should avoid skews / hot partitions +- after the partition is determined via partitioning, the shuffle phase helps get the output of map to the right partition +- finally, the items that arrive at a partition are sorted and then grouped by key, so that the reducer can get (key, iterable of values) +- remember that while the same key cannot go to different partitions, multiple keys can go to the same partition. this is why we need the sort + group operations +- we can hook into partitioning, sorting and grouping phase - helps achieve secondary sorting, joining, etc. discussed later + +## Combiners + +- to reduce the overhead of shuffle, we can add a combiner - this means before shuffling, first combine the outputs of map on a single node +- e.g. if for word count, instead of shuffling, we can first ensure we reduce at the slave node level. this way, a key would be present at most once in one slave node. 
this reduces the amount of data to shuffle +- we can use the same class for combiner and reducer if we want +- combine may or may not run. e.g. if hadoop feels the amount of data is too less, the combine operation might not run. so, following points are important - + - our combine operation should be optional i.e. we should be sure that even if our combine operation does not run, our results stay the same. e.g. we want to find out all the words that occur 200 or more times. we can only add the values for a key in a combiner. writing the word to the context based the condition that it occurs 200 or more times can only stay inside the reducer since at that point, the reducer has all the values. basically, it might happen that one worker's combine sees count as 150 for a particular word and another worker's combiner sees count as 60 for the same word + - input and output format of combine operation should be same so that it whether it runs or not makes no difference (and of course these types should also be the same as output of map and input of reduce) +- so, the entire process looks like this? - map -> combine -> partition -> shuffle -> sort -> group -> reduce + +## HDFS Commands + +- hdfs - hadoop distributed file system +- to list all folders and files in hdfs recursively - `hdfs dfs -ls -R /`. this command works with folders as well i.e. at the end, specify a custom path instead of / +- use `hdfs dfs -put first-speech.txt` to put a file into hadoop. it is placed in /user/shameek (inside hdfs) by default, else specify the custom path at the end of the command +- get a file from hdfs into local - `hdfs dfs -get first-speech.txt` +- read the output from hdfs directly instead of copying it to local first - `hdfs dfs -cat output/part-r-00000` +- change permissions - `hdfs dfs -chmod 777 ExamScores.csv` +- cp copy a file from one location to another inside hdfs - `hdfs dfs -cp ExamScores.csv ExamScores2023.csv` +- moving file from one location to another inside hdfs - `hdfs dfs -mv ExamScores.csv ExamScores2021.csv` +- browse the file system using gui - go to http://localhost:9870/ -> utilities -> browse the file system + +## HDFS Architecture + +- hdfs - hadoop distributed file system +- hdfs is used for terabytes and petabytes of data +- name node is a daemon running on master +- data nodes are daemons running on slave nodes +- name node maintains metadata e.g. which file is stored where. recall how file is stored in distributed mode, replicated mode, etc. these records are maintained in the form of metadata in the name node +- e.g. if we have a file of 300mb. we submit it to the name node, which would then break the file into **splits** of 128mb (default), so 128mb + 128mb + 44mb and stored in different slave nodes, so that they can be processed in parallel +- secondary name node and name node - secondary name node has something called "edit logs". to me, this feels like transaction logs in database i.e. all changes are continuously recorded in the edit logs of the secondary name node. the "fs image" is present on the name node, which is like the current snapshot of the system, e.g. the chunks of file described above is present in data node 1, data node 2 and data node 7. as changes happen continuously, e.g. we add / remove / modify files etc, the changes come up in the edit logs of the secondary name node. 
the secondary name node then periodically looks and then modifies the fs image of the name node to reflect the current state of the system +- hadoop 2.x onwards, a standby name node is present as well. so, hadoop 1.x has a single point of failure unlike hadoop 2.x +- hdfs client - gets the metadata from name node and accordingly requests data nodes for the data i.e. - + - hdfs client asks name node where the data is / tells name node it wants to store file x + - name node responds with how to store file / where the file is stored + - hdfs client then accordingly interacts with data nodes + ![hdfs architecture](/assets/img/hadoop/hdfs.drawio.png) +- my understanding - why the above breaking might be needed - e.g. if name node directly responded to the hdfs client by gathering data from data nodes, entire point of distributing data is lost +- because of this distributed nature of data, there is a checksum present on the name node metadata, and the hdfs client itself calculates the checksum from the data it gathers from the data nodes. these two checksums are compared to verify integrity of data
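+- a rough java sketch of this client flow using the hdfs `FileSystem` api - the namenode uri is a placeholder, and the file path just reuses the earlier first-speech.txt example
+  ```java
+  // classes are from org.apache.hadoop (conf.Configuration, fs.FileSystem, fs.Path, fs.FSDataInputStream, io.IOUtils)
+  Configuration conf = new Configuration();
+  FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), conf);   // metadata requests go to the name node
+
+  Path path = new Path("/user/shameek/first-speech.txt");
+  System.out.println(fs.getFileChecksum(path));                             // checksum computed over the blocks on the data nodes
+
+  try (FSDataInputStream in = fs.open(path)) {                              // the actual bytes are streamed from the data nodes
+      IOUtils.copyBytes(in, System.out, conf, false);
+  }
+  ```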
+- data nodes also send heartbeats to the name node periodically + +## Resource Management Architecture + +### Hadoop 1.x + +- job tracker - daemon located in master, this is where we submit the map reduce jobs via hdfs client +- the job tracker then breaks the job into multiple tasks and submits to task tracker +- task trackers run on the slave nodes. there can be multiple instances of task trackers running on a single slave node +- rack awareness - name node is rack aware i.e. for e.g. client is directed to the closest data node where the data might be present out of all the data nodes having the replicated data. (recall kafka had something similar) +- just like name node vs data node in hdfs, here, the job tracker is a daemon running on master node while the task tracker is a daemon running on slave nodes +- multiple slots can be present on a slave node - understand how a server can be multi core and therefore, perform multiple tasks at a time +- so, these slots are basically jvms which run slices of work + +![hadoop1.x](/assets/img/hadoop/hadoop1.x.drawio.png) + +### Issues + +- hadoop 2.x and 3.x are similar, just performance improvements +- hadoop 1.x vs hadoop 2.x - in hadoop 1.x, cluster resource management and data processing both is done by map reduce framework. in hadoop 2.x, cluster resource management has been delegated to yarn (yet another resource negotiator), while map reduce framework is only responsible for data processing. the underlying storage continues to be hdfs in both versions +- so, map reduce in hadoop 1.x = map reduce (data processing) + yarn (resource management) in hadoop 2.x + +### Hadoop 2.x + +- so now, map reduce is just used for data processing, while cluster resource management is done by yarn +- so, map reduce, spark, etc sit on top of yarn, while hive, pig, etc sit on top of map reduce +- it does things like - + - resource management + - assigning tasks to nodes that have sufficient resources + - rescheduling failed tasks to new nodes +- yarn has two components - resource manager and node manager +- resource manager runs on master +- resource manager has two components - resource scheduler and application manager +- resource scheduler does not deal with any logic around retrying etc, it just cares about assigning of resources (like ram, etc) based on availability +- application manager is responsible for spinning up application masters +- now, when we submit a job, the resource manager with the help of its two components, spins up an application master +- understand that application master is like another container, it is not like a daemon running on master node perennially. so, the application master is scoped to the lifecycle of the application +- now, the application master coordinates with resource scheduler of resource manager to spawn containers that can execute our map / reduce tasks +- containers execute our actual tasks +- a node can have multiple containers, just like in hadoop1.x, multiple slots could be present on a slave node +- the node manager sends heartbeats for health monitoring of node (recall how in hdfs, data nodes do the same thing) +- note - master node is also called as the controller node +- are all the components listed below - resource manager, node manager, resource scheduler, application master, application manager, container, basically components of yarn and map reduce? 
+ +![hadoop2.x](/assets/img/hadoop/hadoop2.x.drawio.png) + +- location constraint - to avoid a lot of data transfer over the network, execute the tasks on the node which is the closest to the data +- so two different things? - location constraint - schedule work on node having data and rack awareness - if for e.g. there is replication, direct node to closest replica +- now, we know that there can be multiple containers being concurrently executed on this node until all its resources are not used up. if more jobs are spawned, the jobs would have to wait in a queue +- how these containers / tasks get scheduled on the node is determined by the scheduling policy - + - fifo scheduler - first in first out scheduler. e.g. assume a job takes 5 minutes, and uses up all the resources of this node. a new job that is submitted almost immediately after this job, and takes say 10 seconds, will still have to wait for the entire duration of 5 minutes till the first job is complete, since there are no resources available for this second job to execute + - capacity scheduler - divide all the resources into multiple parts, e.g. give 30% of the resources to promotional and remaining 70% to searching. this way, both these parts will individually act as fifo schedulers, but a short promotional workload will not be stalled by long running searching and indexing jobs. this is the default and usually preferred one. by default, only one queue is present - default, with 100% of the capacity + - fair scheduler - accept all jobs, and as more jobs come in / go out, allocate each of them equal amount of resources + +## Hadoop Streaming + +- a utility that helps write map reduce programs in non java languages like python, r, etc +- e.g. of using hadoop streaming on my local - `hadoop jar ~/hadoop-3.3.6/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar -files wordcount_mapper.py,wordcount_reducer.py -mapper wordcount_mapper.py -reducer wordcount_reducer.py -input wordcount_input -output output`. here wordcount_mapper and wordcount_reducer are just simple python programs. we read from the input file wordcount_input, mapper outputs to stdout which is then used as input for wordcount_reducer and finally the reducer's output is stored inside output/part-00000 +- wordcount_mapper.py - + ```py + #!/usr/bin/python3 + import sys + + for line in sys.stdin: # for all lines + words = line.split() # grab all words + for word in words: # for all words + print ('{0}\t{1}'.format(word, 1)) # output (word, 1) + ``` +- wordcount_reducer.py + ```py + #!/usr/bin/python3 + import sys + + prev_word = None + prev_count = 0 + word = None + + for line in sys.stdin: # for all (word, 1) + + line = line.strip() + word, count = line.split('\t') + count = int(count) + + if word == prev_word: + prev_count += count # add to previous word count + else: # if current word is not the same as last word + if prev_word: + print('{0}\t{1}'.format(prev_word, prev_count)) # print previous word + prev_word = word # update previous word + prev_count = count + + if prev_word == word: + print('{0}\t{1}'.format(prev_word, prev_count)) + ``` + +### mrjob + +- developed by yelp +- makes it much easier to write and work with map reduce in python - things like chaining jobs etc. become much easier +- we just write one file using clean coding principles unlike using two files like specified in hadoop streaming +- allows writing tests locally (i.e. 
without support around hdfs etc) +- even aws emr etc work with mrjob + +## WordCount Example + +- initial [pom.xml](https://gist.github.com/shameekagarwal/71f127eb24ffe9997c3488cdf8364313) +- run `mvn clean package` +- command to submit job - `~/hadoop-3.3.6/bin/hadoop jar ./target/hadoop-1.0-SNAPSHOT.jar org.example.One input output` +- visit status of job at http://localhost:8088/cluster/apps +- note - for most classes, i find there are two packages we can import from - mapred and mapreduce. we should try using mapreduce where possible +- a basic example for word count - + ```java + public class One { + + public static class MapClass extends Mapper { + + @Override + protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { + String body = value.toString().toLowerCase().replaceAll("[^a-z\\s]", ""); + String[] words = body.split(" "); + for (String word : words) { + if (word.length() >= 7) { + context.write(new Text(word), new LongWritable(1)); + } + } + } + } + + public static class Reduce extends Reducer { + + @Override + protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + long sum = 0L; + for (LongWritable longWritable : values) { + sum += longWritable.get(); + } + context.write(key, new LongWritable(sum)); + } + } + + public static void main(String[] args) throws Exception { + + Path in = new Path(args[0]); + Path out = new Path(args[1]); + + Configuration configuration = new Configuration(); + Job job = Job.getInstance(configuration); + + try { + FileSystem hdfs = FileSystem.get(configuration); + hdfs.delete(out, true); + } catch (Exception ignored) { + } + + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(LongWritable.class); + + job.setMapperClass(MapClass.class); + job.setCombinerClass(Reduce.class); + job.setReducerClass(Reduce.class); + + job.setInputFormatClass(TextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + + FileInputFormat.setInputPaths(job, in); + FileOutputFormat.setOutputPath(job, out); + + job.setJarByClass(One.class); + job.submit(); + } + } + ``` +- the mapper and reducer classes that we extend are generics, where the types are for the key and value of input and output respectively +- we also recursively delete the output folder because if we rerun jobs without doing this, there is a failure around folder already exists +- the output format has files like part-r-00000, where r indicates that the output is due to a reduce operation and the last number is the partition id +- recall how by default, number of reducers is 1. to change the number of reducers, simply write `job.setNumReduceTasks(2)` +- e.g. in this case, i see two files in the output folder - part-r-00000 and part-r-00001 +- built in functions - e.g. map reduce ships with a `LongSumReducer` which we could have used here - sum for each key, where the value is long +- my confusion cleared - `setOutputKeyClass` and `setOutputValueClass` are used for reducer outputs, while `setMapOutputKeyClass` and `setMapOutputValueClass` are used for map outputs. i think there are some times when we do not need to include the map ones, but i think i might as well just include all of them every time tbh + +## Constructing Map Reduce Logic + +### Numeric Summary Metrics + +- e.g. imagine we have a list of rows, where each row has a subject name and score obtained by any student. 
we want to calculate the average score for each subject + + | subject | marks | + | ----------- | ----- | + | chemistry | 75 | + | mathematics | 81 | + | chemistry | 79 | + +- constructing map reduce logic - **since we want to group based on subject, output key of map should be subject**. the numerical statistic that we want to perform, e.g. average in this case, can be done inside the reducer +- so, remember - map's job is to output the right key, and reduce's job is to output the right value based on all the values available for a key +- our map would try to make a key for the subject name, and output the marks as the value +- our reduce would just run (sum of all values / size of list of values) +- if we use the combiner as the same function that was used for reducer - e.g. if one node had 55 and 65 for chemistry, and another node had 75 for chemistry, the right average would be 65, but in our case, the combiner would output be 60 on node 1 and 75 for node 2, thus making the reducer output to be 67.5 +- recall how output of map = input of combiner = output of combiner = input of reducer. so, we can instead output a tuple as the value from the map as (marks, 1). combiner can then output (sum of marks, size). this way, the reducer now receives a list of tuples, and it has to add the first value of tuples for the total and divide it by the sum of second values of the tuple for the final average +- if we want to use custom data types - for keys, we must implement the `WritableComparible` interface, while the data types used for values must implement the `Writable` interface +- we need to write implementation of things like serialization and deserialization. hadoop input and output classes have helpers for this, e.g. `readUTF` / `writeUTF` for strings, `readDouble` / `writeDouble` for doubles, etc + - remember to keep the order of serialization and deserialization to be the same + - remember to keep a no args constructor (used by hadoop internally) +- so, we would need an extra class to store the total marks and number of students with that marks, if we want to use combiners + ```java + public static class MapClass extends Mapper { + + @Override + protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { + String[] record = value.toString().split(","); + context.write(new Text(record[0]), new AverageWritable(Long.parseLong(record[1]), 1L)); + } + } + + public static class Combine extends Reducer { + + @Override + protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + long count = 0; + long score = 0; + for (AverageWritable value: values) { + score += value.getTotal(); + count += value.getNoOfRecords(); + } + context.write(key, new AverageWritable(score, count)); + } + } + + public static class Reduce extends Reducer { + + @Override + protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + long count = 0; + long totalScore = 0; + for (AverageWritable value: values) { + totalScore += value.getTotal(); + count += value.getNoOfRecords(); + } + context.write(key, new DoubleWritable((totalScore * 1.0) / count)); + } + } + ``` +- the custom data type AverageWritable looks like below - + ```java + @NoArgsConstructor + @AllArgsConstructor + @Data + public class AverageWritable implements Writable { + + private long total; + + private long noOfRecords; + + @Override + public void write(DataOutput out) throws IOException 
{ + out.writeLong(total); + out.writeLong(noOfRecords); + } + + @Override + public void readFields(DataInput in) throws IOException { + total = in.readLong(); + noOfRecords = in.readLong(); + } + } + ``` + +### Filtering + +- e.g. if we want to filter the data based on a condition, we can perform the filtering in the map function, and the reduce can just be an identity function +- e.g. if we make the output key of the map function as null, all the items would be received by the reducer in one go and it can write out all the items at once +- notice the use of singleton for `NullWritable` to reduce memory used + ```java + public class MapClass extends Mapper { + + @Override + protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { + String row[] = value.toString().split(","); + if (row[2].equalsIgnoreCase("Books")) { + context.write(NullWritable.get(), value); + } + } + } + ``` +- we do not call `setReducerClass` so that the identity reducer can kick in. identity reducer = a reducer that will just call `context.write(key, value)` for all the values that it receives - + ```java + job.setMapperClass(MapClass.class); + ``` + +### Distinct Values + +- if we want the distinct values, e.g. something that works like the `distinct` clause in sql +- we have a file with a word in every new line, and we would like to find a list of all the distinct words +- we can again use null writable instead of outputting dummy values like 1 for performance +- map class - + ```java + public class MapClass extends Mapper { + + @Override + protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { + context.write(value, NullWritable.get()); + } + } + ``` +- understand how the reducer here is not exactly identity - it would output one value for a key, not multiple like in the above example of filtering. reducer / combiner - + ```java + public class Reduce extends Reducer { + + @Override + protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + context.write(key, NullWritable.get()); + } + } + ``` +- note - the output was in sorted order - recall why this happens due to the sorting after the shuffle process + +### Top N Records + +- e.g. each row has user id and their number of followers, and we want to show the top n users + + | user_id | followers | + |---------|-----------| + | 1 | 30 | + | 2 | 30000 | + | 3 | 20 | + | 5 | 50 | + | 6 | 6000 | + +- my understanding - solution 1 - output key as null for all rows, so one reducer gets all the rows. there is a bottleneck here, since we cannot have more than one reducer for top n records +- all mappers work on subsets of data +- e.g. we can get all mappers to find the top n of the data they are responsible for +- note - it can happen that the mappers output less than n if the data that they have is small +- for a mapper to output top n records, it can do so only after all records in the partition it is responsible for have been processed, because mappers are called once per record for all records in the split it is responsible for - `cleanup` +- note - we have written the user for ascending order - priority queue will have the user with the lowest number of followers at the top. so, we just try to ensure priority queue size doesn't go over three, and that incoming element just needs to be larger than that whats at the top of the priority queue (i.e. 
smallest in the priority queue) +- we use User as output of map, so we could have just implemented writable, but we implement writable comparable so that we can use its compare to function, used by priority queue - + ```java + @Data + @AllArgsConstructor + @NoArgsConstructor + public class User implements WritableComparable { + + private String userId; + + private Integer numberOfFollowers; + + @Override + public void write(DataOutput out) throws IOException { + out.writeUTF(userId); + out.writeInt(numberOfFollowers); + } + + @Override + public void readFields(DataInput in) throws IOException { + userId = in.readUTF(); + numberOfFollowers = in.readInt(); + } + + @Override + public int compareTo(User o) { + return numberOfFollowers - o.getNumberOfFollowers(); + } + } + ``` +- map - + ```java + @Slf4j + public class MapClass extends Mapper { + + private final PriorityQueue pq = new PriorityQueue<>(); + + @Override + protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { + String[] row = value.toString().split("\t"); + User user = new User(row[0], Integer.parseInt(row[1])); + + if (pq.size() < 3 || pq.peek().getNumberOfFollowers() < user.getNumberOfFollowers()) pq.add(user); + if (pq.size() > 3) pq.poll(); + + log.info("pq is [{}], user is [{}]", pq, user); + } + + @Override + protected void cleanup(Mapper.Context context) throws IOException, InterruptedException { + while (!pq.isEmpty()) { + log.info("writing user [{}]", pq.peek()); + context.write(NullWritable.get(), pq.poll()); + } + } + } + ``` +- **in mapper** - above, we used cleanup of mapper. this technique is called in mapper. it is an alternative to, and sometimes more optimal than combiners + - in case of combiner, the mapper would write to files, then the combiner would read from and again write to the files + - in case of in mapper, we do everything in memory using for e.g. priority queue here. so while there is memory overhead, it is more optimal from performance pov +- lets say for all these n values, the mappers output the same key, say null +- now, all map outputs can come into the same list into a reducer this way +- so, the reducer basically receives the combination of top n outputs of all mappers +- note - for this to work, we had to use a single reducer +- here cleanup is not needed like in map, since reducer itself will get all the values +- note - a weird thing i have experienced here - `pq.add(value)` changes everything in priority queue to whats added the last time to the priority queue - like a pass by reference vs value thing, but why? however, cloning the user i.e. 
`pq.add(new User(value.getUserId(), value.getNumberOfFollowers()));` fixed the issue + ```java + @Slf4j + public class Reduce extends Reducer { + + @Override + protected void reduce(NullWritable key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + + PriorityQueue pq = new PriorityQueue<>(); + + for (User value : values) { + if (pq.size() < 3 || pq.peek().getNumberOfFollowers() < value.getNumberOfFollowers()) { + pq.add(new User(value.getUserId(), value.getNumberOfFollowers())); + } + if (pq.size() > 3) pq.poll(); + log.info("pq is [{}], user is [{}]", pq, value); + } + + while (!pq.isEmpty()) { + log.info("writing user [{}]", pq.peek()); + context.write(NullWritable.get(), pq.poll()); + } + } + } + ``` +- so, the obvious bottleneck is that we are limited to using just one reducer +- we know that one reducer receives all the keys that it is responsible for in sorted order +- however, this order breaks across reducers - e.g. reducer 1 receives (a,5), (d,6), (w,5), while reducer 2 receives (b,2), (c,5), (e,7). the output from the two reducers are sorted at an individual level, but this order breaks when combined +- with "total order partitioning" (not discussed here), the idea is that the reducer 1 receives (a,5), (b,2), (c,5), while reducer 2 receives (d,6), (e,7), (w,5), i.e. we are ensuring keys received across reducers are ordered as well +- if we implement a custom partitioner, a naive way would be send letters a-j to partition 1, k-r to partition 2 and s-z to partition 3. while this does ensure even distribution in terms of the number of keys, this can mean uneven distribution since there can be hot keys. all of this is handled by the total order partitioner + +### Indexes + +- search engines periodically visit websites and store the text in their own database - they create an index +- web pages are crawled repeatedly for all the data to build an index and keep it updated +- then, when a user initiates a search, these engines search through their own index instead of going to the websites +- inverted indexing - search engines generate an index based on the contents of the websites. e.g. mango is contained in files 1 and 3, war in files 1 and 5 and so on. the input was just files, while the output has the key as word, the value as the files containing this word. this structure is called an inverted index +- analogy behind inverted index - website themselves are an index - we type in a url and get back the content. the key is the url and the value the content. however, we generate an inverted index by using content as keys and the urls as values, so that for a search term, we know what urls may contain relevant information to it +- tf - term frequency - number of times a word appears in a document / total number of words in the document. e.g. if mango appears 5 times in a document with 1000 words, tf = 0.005 +- while calculating the tf, all words are considered equally important, so to help scale the rare words up, we use idf i.e. rare words across documents are bumped up +- idf - inverse document frequency - log (total number of documents / number of documents having the word). e.g. 
if 1,000 files have the word we are searching for out of 1,000,000, idf = 3 +- so, we would want the value of tf * idf to be high for our website to come up on the top +- so, all these calculations around building indexes from huge amounts of raw data (websites) very fast using distributed processing is what big data helps with +- a simple way of achieving this - we know that our output should contain the word as key and list of urls containing it as output. so, the map should output for all words on the page, that word as the key the url as value. now, the reducer receives all the urls for a word + +## File Formats + +- file formats - used when we wrote the following bit of code - + ```java + job.setInputFormatClass(TextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + ``` +- the different possible options for input formats are - + - `TextInputFormat` - file contains only values. key is line number, which is why we were using `LongWritable` for key of map till now everywhere + - `KeyValueTextInputFormat` - used when input file contains key as well + - `SequenceFileInputFormat` - uses compression, useful when we chain map and reduce jobs i.e. input of the second job is the output from the first job + - `NLineInputFormat` - recall how by default file is split into segments of 128mb each. this way, for e.g. if we have 6 slave nodes and only a 500mb file, we cannot use all our slave nodes properly. this is where this option is useful, whereby we can specify the number of lines that should go into per split, thus helping us utilize our cluster more effectively +- the different possible options for output formats are - + - `TextOutputFormat` - each line has the key and value separated by a tab + - `SequenceFileOutputFormat` - uses compression, useful when we chain map and reduce jobs +- so for e.g. for the exams example discussed in the section before, the format of a line was for e.g. Chemistry,79. so, we can use the `KeyValueTextInputFormat` class for it as follows i.e. note how the map doesn't have to extract the key by using split on the value like earlier. note - specify the separator as well, since tab is the default - + ```java + public static class MapClass extends Mapper { + + @Override + protected void map(Text key, Text value, Mapper.Context context) throws IOException, InterruptedException { + context.write(key, new AverageWritable(Long.parseLong(value.toString()), 1L)); + } + } + + // ... + configuration.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ","); + job.setInputFormatClass(KeyValueTextInputFormat.class); + ``` + +## Chaining Jobs + +- e.g. imagine we have data in the following format i.e. each row has marks obtained for a student - the school that student is from and the subject. 
for all subjects, we would like to obtain the school with the highest average, and the actual average + + | school | subject | marks | + | ------------------ | --------- | ----- | + | Bigtown Academy | Chemistry | 44 | + | Bigtown Academy | French | 69 | + | Mediumtown College | Biology | 61 | + | Largetown School | French | 67 | + +- so, we can break the problem as follows into two separate map reduce jobs - +- first job's map output - key = (school, subject), value = (marks, 1) (recall the value is this strange tuple because of the constraint when using combiners around types) + ```java + public static class MapClass extends Mapper { + + @Override + protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { + String[] record = value.toString().split(","); + ExamScoresV2KeyWritable newKey = new ExamScoresV2KeyWritable(record[0], record[1]); + AverageWritable averageWritable = new AverageWritable(Long.parseLong(record[2]), 1L); + context.write(newKey, averageWritable); + } + } + ``` +- first job's combiner output - key = (school, subject), value = (sum of marks, total students) + ```java + public static class Combine extends Reducer { + + @Override + protected void reduce(ExamScoresV2KeyWritable key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + long count = 0; + long score = 0; + for (AverageWritable value: values) { + score += value.getTotal(); + count += value.getNoOfRecords(); + } + context.write(key, new AverageWritable(score, count)); + } + } + ``` +- first job's reducer output - key = (school, subject), value = average of the school in the subject + ```java + public static class Reduce extends Reducer { + + @Override + protected void reduce(ExamScoresV2KeyWritable key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + long count = 0; + long score = 0; + for (AverageWritable value: values) { + score += value.getTotal(); + count += value.getNoOfRecords(); + } + context.write(key, new DoubleWritable(score * 1.0 / count)); + } + } + ``` +- second job's map output - key = subject, value = (school, its average for that subject). however, notice how it can read directly the key from the output of the earlier job, so we can also set the input format on the job directly as `job.setInputFormatClass(KeyValueTextInputFormat.class)` + ```java + public static class MapClass extends Mapper { + + @Override + protected void map(Text key, Text value, Mapper.Context context) throws IOException, InterruptedException { + String[] record = key.toString().split(","); + context.write(new Text(record[1]), new SchoolAverageWritable(record[0], Double.parseDouble(value.toString()))); + } + } + ``` +- second job's combiner output - key = subject, value = (school with maximum average for the subject, the average) + ```java + public static class Reduce extends Reducer { + + @Override + protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + SchoolAverageWritable max = new SchoolAverageWritable(null, -1); + for (SchoolAverageWritable value: values) { + max = max.getAverage() > value.getAverage() ? 
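                    // keep whichever school has the higher average so far - a copy of value is created
                    // instead of storing the reference, since hadoop reuses the same value object across iterations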
max : new SchoolAverageWritable(value.getSchool(), value.getAverage()); + } + context.write(key, max); + } + } + ``` +- second job's reducer output - _same as above_ +- so, the entire thing has been broken down into two jobs, which can be run one after another +- while we can run manually, hadoop can help achieve this via code using "job control" + ```java + Configuration configurationOne = new Configuration(); + Configuration configurationTwo = new Configuration(); + + ControlledJob controlledJobOne = new ControlledJob(configurationOne); + ControlledJob controlledJobTwo = new ControlledJob(configurationTwo); + + // notice how input of second job = output of first job + // these static calls of getJob do stuff like setting types on job, + // setting inputs and outputs, setting mappers, calling jarByClass, etc + // all of which we have seen earlier + Job jobOne = FiveJobOne.getJob(configurationOne, new Path(args[0]), new Path(args[1])); + Job jobTwo = FiveJobTwo.getJob(configurationTwo, new Path(args[1]), new Path(args[2])); + + controlledJobOne.setJob(jobOne); + controlledJobTwo.setJob(jobTwo); + + // adding dependency + controlledJobTwo.addDependingJob(controlledJobOne); + + JobControl jobControl = new JobControl("SchoolWithHighestAverage"); + jobControl.addJob(controlledJobOne); + jobControl.addJob(controlledJobTwo); + + // some thread stuff we have to do + // when running controlled jobs + Thread thread = new Thread(jobControl); + thread.setDaemon(true); + thread.start(); + + while (!jobControl.allFinished()) { + Thread.sleep(500); + } + ``` +- now recall how if chaining jobs, we can make use of compression - we can notice this if we try to run `cat` on the intermediate outputs (i.e. what we specify using `arg[1]` above) + ```java + // inside job 1 - + job.setOutputFormatClass(SequenceFileOutputFormat.class); + + // inside job 2 - + job.setInputFormatClass(SequenceFileInputFormat.class); + ``` +- the caveat of the above is output format of key / value of reduce of first job = input format of key / value of map of second job, which was not really needed otherwise if not using compression i.e. we could write double from reduce, and while reading read as string and parse this string into double +- my doubt - since max is needed, could we have used secondary sorting? is secondary sorting usually more optimal for finding maximum? + +## Pre and Post Processing + +- pre and post processing - to perform some steps after and before the job +- these pre and post processing steps work just like map tasks +- so the effective structure of hadoop can be said to be as follows + - multiple maps in the form of pre processing + - the actual map + - an optional combiner + - the shuffle step done by hadoop internally which then helps run reduce + - the reduce on a per key basis + - multiple maps in the form of post processing +- so, the structure when using pre and post processing looks like follows i.e. this replaces the `job.setMapper` etc calls - (the 4 types in between are for input key class, input value class, output key class and output value class). 
note - i think for adding combiner however, like stated below, i had to go back to `job.setCombinerClass` + ```java + // pre processing + ChainMapper.addMapper(job, PreProcessing.class, Text.class, Text.class, Text.class, Text.class, confOne); + + // the actual map, but syntax is same + ChainMapper.addMapper(job, MapClass.class, Text.class, Text.class, Text.class, AverageWritable.class, confTwo); + + // combiner + job.setCombinerClass(Combine.class); + + // reducer (note how it is setReducer and not addReducer like addMapper, since only one reducer can be used) + ChainReducer.setReducer(job, Reduce.class, Text.class, AverageWritable.class, Text.class, DoubleWritable.class, confTwo); + + // post processing + ChainReducer.addMapper(job, PostProcessing.class, Text.class, DoubleWritable.class, Text.class, DoubleWritable.class, confTwo); + ``` + +## Optimization + +- optimizing disk io in hadoop - in hadoop, the file is read from / written to disk at each step + - reduce size using pre processing - e.g. drop extraneous data + - use sequence file formats + - optimize the file itself before sending it to hadoop - e.g. xml would be much worse to process due to extra lines of tags compared to something like csv +- optimizing network io - this happens during shuffle + - add a combiner + - order input data using keys beforehand so that there is less network required during shuffling +- optimizing processing - this is more "code based" + - if we have to create something like `new LongWritable(1)` in the map class for e.g. in word count, we can instead create it at the global class level and reference it in the map task. this way, we don't create a new object every time, thus saving up on time for creation of these objects and more importantly garbage collection time + - use string builders instead of string if strings change frequently + - there is some time spent in instantiating a jvm. a new jvm is created for each task in a job by default, e.g. imagine a chain of mappers initially when using pre processing. however, we can reuse jvm across these tasks. we should observe how garbage collection works after this optimization. `conf.set("mapreduce.job.jvm.tasks", "10")`. 10 means reuse jvm for 10 tasks, 1 is the default i.e. 1 jvm per task and setting it to -1 means use one jvm for all tasks. note - this jvm reuse can only happen in a job, not across jobs + - recall why and how n line input format can be useful + - null writable - when we are just interested in the key (e.g. find the most frequently occurring words), and not the value, instead of using a dummy value like `new Text("")`, we can instead use `NullWritable.get()`, and notice how this is using singleton pattern, thus matching the first point of this section optimizing processing +- logging - this can be useful for for e.g. pseudo distributed, in standalone i can see the logs directly in the console as well. to view the logs, go to http://localhost:8088/cluster -> tools -> local logs -> userLogs. this will have a link to all job logs. go to the last job we ran -> and now this will have logs for all containers. i was just using lombok's `@Slf4j` and could automatically see the logs properly without any extra configuration +- hadoop also shows something called counters in ui, and this can be very useful for the health of job. we can add custom counters to it. 
we simply need to do is as follows (note - we have to use an enum i think) + ```java + enum RemovedRows { + LOW_SCORES, INVALID_DATA + } + + context.getCounter(RemovedRows.LOW_SCORES).increment(1); + ``` +- relational databases - we usually deal with files in hadoop because relational databases cant cope with massive amounts of data. yet we can read from / write to (preferable because this data is usually much smaller than input) relational databases +- when reading from database, each map task (remember how in production we will have multiple slave nodes etc) will initiate a read from the database. this can overload the database with jdbc connections (db proxy is the solution here?) + +## Unit Testing + +- mrunit - unit testing out map reduce code +- ps - this did not work for me, basically mrunit was relying on mapred versions and not mapreduce? however, written the code snippet below for reference +- adding the dependency - (note - i had to add the classifier for this to work) - + ```xml + + org.apache.mrunit + mrunit + 1.1.0 + test + hadoop2 + + ``` +- `MapDriver`, `ReduceDriver`, `MapReduceDriver` - it is as simple as us specifying the class we used e.g. for mapping we use MapClass, then specify the input and the expected output, and call runTest on these drivers to perform the assertion + ```java + public class TwoTest { + + MapDriver mapDriver; + + @Before + public void setUp() throws Exception { + mapDriver = MapDriver.newMapDriver(new Two.MapClass()); + } + + @Test + public void test() throws IOException { + mapDriver.addInput(new Text("chemistry"), new Text("79")); + mapDriver.addInput(new Text("chemistry"), new Text("91")); + mapDriver.addInput(new Text("mathematics"), new Text("67")); + + mapDriver.addOutput(new Text("chemistry"), new AverageWritable(79, 1)); + mapDriver.addOutput(new Text("chemistry"), new AverageWritable(91, 1)); + mapDriver.addOutput(new Text("mathematics"), new AverageWritable(67, 1)); + + mapDriver.runTest(); + } + } + ``` + +## Secondary Sorting + +- each node can have multiple partitions (which are recall 128 mb in size) +- now, for reduce to work, values for a key need to go to the same partition +- because of the way the shuffle process works, the values for a key in the reduce process come in random order +- now, imagine we want the values for a key to be in sorted order as well to for e.g. find the maximum +- one way can be we simply find the maximum by looping over all elements (`O(n)`), since we already have all the values for that key - inefficient +- so, we do something called secondary sorting +- now, we would like to ensure that the reducer gets the iterable of values in sorted order. so, here is how we can achieve it - + - construct a key where key = (actual_key, value) in the map process + - write a custom partitioner so that the partition is determined only using the actual_key part of the key (`Partitioner#getPartition`) + - ensure sort takes into account the key as is, so both (actual_key, value) are used (`WritableComparable#compareTo` i.e. present inside our custom key class) + - ensure group takes into account only the actual_key part of the key (`WritableComparator#compare`) +- so, example of an implementation of secondary sorting - imagine we have a csv, where each row has the subject name and the marks obtained by a particular student in it. we want highest score for each subject. so, we need to sort by both subject and marks, but use only subject for partitioning and grouping. so, the key would be a tuple of (subject, marks). 
we can also have a combiner that works just like the reducer, except that it needs to input and output the same tuple of (subject, maximum marks) (maximum needs to consider data across all nodes, but maximum from every node is sufficient for evaluating this) +- custom key - also determines how to sort, which uses first subject and then score (descending). so, items for a specific key (subject) are sorted by values (marks in descending) + ```java + @Data + @AllArgsConstructor + @NoArgsConstructor + public class ExamSubjectAndScoreKey implements WritableComparable { + + private String subject; + + private Integer score; + + @Override + public int compareTo(ExamSubjectAndScoreKey o) { + int result = subject.compareTo(o.subject); + return result == 0 ? o.score - score : result; + } + + @Override + public void write(DataOutput out) throws IOException { + out.writeUTF(subject); + out.writeInt(score); + } + + @Override + public void readFields(DataInput in) throws IOException { + subject = in.readUTF(); + score = in.readInt(); + } + } + ``` +- grouping comparator, group using subject only. note - we have to add the constructor with call to super, otherwise we get a npe + ```java + public class SubjectComparator extends WritableComparator { + + public SubjectComparator() { + super(ExamSubjectAndScoreKey.class, true); + } + + @Override + public int compare(WritableComparable a, WritableComparable b) { + ExamSubjectAndScoreKey keyA = (ExamSubjectAndScoreKey) a; + ExamSubjectAndScoreKey keyB = (ExamSubjectAndScoreKey) b; + return keyA.getSubject().compareTo(keyB.getSubject()); + } + } + ``` +- partitioner, partition using subject only - + ```java + public class SubjectPartitioner extends Partitioner { + + @Override + public int getPartition(ExamSubjectAndScoreKey key, IntWritable score, int numPartitions) { + return key.getSubject().hashCode() % numPartitions; + } + } + ``` +- configure both partitioner and grouping comparator using - + ```java + job.setPartitionerClass(SubjectPartitioner.class); + job.setGroupingComparatorClass(SubjectComparator.class); + ``` +- map, combine and reduce - note how reduce and combiner are the same apart from the output format. my doubt - i thought secondary sorting only helps with reducer values being sorted i.e. how can we use `values.iterator.next()` for combiner? + ```java + public static class MapClass extends Mapper { + + @Override + protected void map(Text key, Text value, Mapper.Context context) throws IOException, InterruptedException { + int score = Integer.parseInt(value.toString()); + context.write(new ExamSubjectAndScoreKey(key.toString(), score), new IntWritable(score)); + } + } + + public static class Combine extends Reducer { + + @Override + protected void reduce(ExamSubjectAndScoreKey key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + context.write(key, values.iterator().next()); + } + } + + public static class Reduce extends Reducer { + + @Override + protected void reduce(ExamSubjectAndScoreKey key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + context.write(new Text(key.getSubject()), values.iterator().next()); + } + } + ``` + +## Joining + +- imagine doing this using what we know up till now +- e.g. we have a csv where each row represents an order for a customer. 
so, it contains customer id, date and total + + | customer id | total | + | ----------- | ------ | + | 18 | 233.28 | + | 17 | 27.35 | + | 18 | 202.23 | + +- another csv contains a row per customer, where each row has the customer id and country of origin + + | customer id | country | + | ----------- | ------- | + | 1 | France | + | 2 | Russia | + | 3 | Germany | + +- now, we would like to find totals by countries - so, we would use joins +- job 1 - map first csv into (customer id, [total, *null*]), identity reducer +- job 2 - map second csv into (customer id, [*0*, country]), identity reducer +- job 3 - the two above outputs can be combined since they have the same format, e.g. recall how we can specify not just a file but folder as well when running hadoop jobs, and the folder here would contain the outputs from both jobs above. now, we use an identity mapper, and then perform a reduce to get (country, total) for every customer. basically, in the iterable, there would be multiple values where country is null, and just one value where the country is not null but the total is 0. understand that the reducer of this job is called once for every key i.e. customer. we don't want to output one row per customer, but one row per country - so, we need yet another job's reduce to help us do some grouping +- job 4 - identity mapper, reduce can now just sum the totals, as the key is now country +- using secondary sorting - we would tag data from country csv with 1 and data from sales csv with 2. map would read from both files. now, we would perform secondary sorting logic - this way, we would have a dataset where the first row has key = customer_id, 1 for the country data, and following rows have key = customer_id, 2 for the sales data. we can group keys with multiple values under same reducer due to secondary sorting logic, this would output country, sum_of sales. so, the output of this first job is basically for each customer, there is a row, where the key is country and the value is total amount of sales for this customer. so, we can follow this up with a second job that has an identity mapper and a reducer to calculate the total +- so, this trick around secondary sorting basically helped us eliminate jobs 1 to 3 +- we can tag the two datasets in the configuration as follows + ```java + Path in1 = new Path(args[0]); + Path in2 = new Path(args[1]); + Path out = new Path(args[2]); + + Configuration configuration = new Configuration(); + configuration.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ","); + + configuration.set(in1.getName(), "1"); + configuration.set(in2.getName(), "2"); + ``` +- the mapper can be written as follows. 
**note how it extracts the tag and creates the new key using it, so that it can be used during the secondary sorting phase** - + ```java + public static class MapClass extends Mapper { + + @Override + protected void map(Text key, Text value, Mapper.Context context) throws IOException, InterruptedException { + FileSplit fileSplit = (FileSplit) context.getInputSplit(); + Integer tag = Integer.parseInt(context.getConfiguration().get(fileSplit.getPath().getName())); + Integer customerId = Integer.parseInt(key.toString()); + context.write(new CustomerAndTagKey(customerId, tag), value); + } + } + ``` +- each mapper is responsible for a split of the data, and that file split's name is used to tag the different files to help determine what table they belong to +- the reducer can now be certain that the first row would represent the country - + ```java + public static class Reduce extends Reducer { + + @Override + protected void reduce(CustomerAndTagKey key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + Iterator values$ = values.iterator(); + String country = values$.next().toString(); + double total = 0; + while (values$.hasNext()) { + total += Double.parseDouble(values$.next().toString()); + } + context.write(new Text(country), new DoubleWritable(total)); + } + } + ``` +- `CustomerTagKey#compareTo` - use both customer id and the tag. ensure that in the iterable received for a customer, first record contains the country, and remaining contain the totals for that customer + ```java + @Override + public int compareTo(CustomerAndTagKey o) { + return customerId.equals(o.customerId) ? tag - o.tag : customerId - o.customerId; + } + ``` +- `CustomerPartitioner#getPartition` - only use the customer id for determining the partition + ```java + @Override + public int getPartition(CustomerAndTagKey customerAndTagKey, Text text, int numPartitions) { + return customerAndTagKey.getCustomerId().hashCode() % numPartitions; + } + ``` +- `CustomerComparator#compare` - only use the customer id to group + ```java + @Override + public int compare(WritableComparable a, WritableComparable b) { + CustomerAndTagKey keyA = (CustomerAndTagKey) a; + CustomerAndTagKey keyB = (CustomerAndTagKey) b; + return keyA.getCustomerId().compareTo(keyB.getCustomerId()); + } + ``` +- now, we need to chain another job for actually totaling across customers, already discussed +- apparently what we discussed till now is called a reduce side join, there is another type called map side join, which can be more performant in some cases, but has limitations (same as [spark's](/posts/spark) broadcast join?) - + - reduce side join - tagging datasets, so reducer gets an iterable, which has a value for each row from both datasets which are a part of the join + - map side joins - one dataset is small enough to fit in a jvm, and the join is done in map side and not in reduce side +- e.g. 
of map side join - we have a handful of stop words to flag in some analysis for our use case, which can easily fit in a jvm diff --git a/_posts/2023-09-24-docker-and-kubernetes.md b/_posts/2023-09-24-docker-and-kubernetes.md new file mode 100644 index 0000000..b9d2280 --- /dev/null +++ b/_posts/2023-09-24-docker-and-kubernetes.md @@ -0,0 +1,1165 @@ +--- +title: Docker and Kubernetes +--- + +## About Docker + +- docker is a tool for managing containers +- container is a package of our code along with the dependencies and libraries to run that code +- docker follows a client server architecture + - we issue commands via cli to the docker client + - all tasks like creating containers, pulling images, etc. is done by docker daemon (dockerd) +- docker can be run natively in linux, so for macOS and windows, a virtualization layer is needed +- docker engine - dockerd, docker client +- docker desktop - docker engine, docker cli, kubernetes, docker compose, etc + +## Why use Docker + +- the same piece of code will always yield the same application i.e. doesn't rely on host environment +- having similar development, staging and production environments +- easily manage different projects running different versions of dependencies +- easily switch between versions of dependencies +- virtual machines are not as easily reproducible as containers since they have their own dedicated OS +- sharing and distributing is very convenient using Dockerfile, image, etc + +![docker vs vm](/assets/img/docker-and-kubernetes/docker-vs-vm.drawio.png) + +## Images and Containers + +- images are templates for containers, and a container is a running instance of an image +- containers are lightweight, isolated and run independent of each other +- we can use official prebuilt images, the most common source is [docker hub](https://hub.docker.com) +- note: while issuing docker commands + - container name and container id can be used interchangeably, same for image + - first few characters of the image_id are enough to reference the image if they can uniquely identify it +- `docker container run image_name` to create a container from an image +- if the image is not available locally, it is downloaded from dockerhub by docker +- `docker container ls` to list all running containers + - `docker container ls -a` to list all running as well as stopped containers +- Dockerfile is a special file name, as it is the default file docker looks for when we build an image +- Dockerfile contains the instructions for creating our own image +- example of a Dockerfile + ```Dockerfile + FROM node:14-alpine + WORKDIR /app + COPY . . + RUN npm install + EXPOSE 80 + CMD npm run start + ``` +- **all commands except the last instruction `CMD` are used to build the image, `CMD` is used to run the container** +- so basically `CMD` is used for `docker container run...` +- `EXPOSE` is only for documentation purpose +- `docker image build .` - used to build an image using the Dockerfile, `.` here is the build context + - `-t` flag to specify an image tag +- images have layers i.e. docker caches result after every instruction in the Dockerfile +- this means docker can reuse layers if possible - e.g. two different react applications use the same base image - node layer +- so, to optimize i.e. 
make building of images faster, in the Dockerfile example shown earlier, we can first install dependencies and then copy the source code, as rebuilding of image will be triggered more frequently by a change in the source code than it will be by a change in the dependencies + ```Dockerfile + FROM node:14-alpine + WORKDIR /app + COPY package.json . + RUN npm install + COPY . . + EXPOSE 80 + CMD npm run start + ``` +- `docker container start container_id` - start a stopped container +- we can reattach our terminal to the container using `docker container attach container_id` +- we can view logs using `docker container logs container_id` + - add `-f` flag for following the logs +- flags for `docker container run` - + - `-it` can be used to enter interactive mode + - `--rm` flag to delete the container when we stop it + - `--name` to specify the name of container + - `-d` to run in detached mode i.e. to not block our current terminal and run the container in foreground + - `-p` flag means publish, i.e. map host port to a container port +- `docker image ls` - lists the downloaded images +- `docker image rm image_id` - remove the image with id image_id +- `docker container stop container_id` - stop the container +- `docker container prune` to delete all stopped containers +- to get more information on images and containers, use `docker container inspect container_id` and `docker image inspect image_id` +- `docker container cp host_folder container_id:folder` to copy folder from the host to the container + - we can also reverse the order of arguments to copy folders and files from the container to the host +- we can share images, by sharing the Dockerfile or by hosting it on an image registry like docker hub +- `docker image push image_name:tag` to push images to the registry +- `docker image pull image_name:tag` to pull images from the registry +- we can also tag images using `docker image tag new_image_name old_image_name` +- a full example of running a container - `docker container run -d -p 3000:80 --name=backend --rm backend` +- `docker login` to login to docker hub +- note: i had to generate and use a personal access token in the docker hub ui and use that instead of the docker hub password in the cli +- `docker image rm -f $(docker image ls -a -q)` - deletes all locally downloaded images + - `-q` helps list only image ids + - `-a` helps list intermediate images as well + - `-f` force removal, e.g. if image is referenced by another image +- we can use a file `.dockerignore` to prevent copying files when using command `COPY` inside the Dockerfile e.g. + ``` + node_modules + Dockerfile + .git + ``` + +## Tags + +- **an image tag has two parts - the name / repository of the image and the tag** +- tag is like a version, so we can generate different versions of our image +- the default tag if not specified is latest +- why tags are important - + - rollback to previous versions in the production environment if newer versions have a bug + - newer versions of other images which are used by our images might have breaking changes in future +- suppose we always push and pull using tag latest. when we run `docker container run...`, it looks for the image locally and if it doesn't find it, it goes online to fetch it. but it will find the image with the tag latest, and docker doesn't understand that someone else has pushed a newer version online + +## Layered Architecture + +- all the docker related data like images, containers, etc. 
can be seen in /var/lib/docker +- the docker image we build contains of layers, and these layers are shared across various images +- e.g. if two images use the same base image, the layer of the base image is shared +- the image layers are read only +- when we create a container, a new layer is created on top of the existing layers of the image +- thus, all writes that we perform during runtime, log files, etc. get written onto this layer +- the persistence during the container's lifetime happens through this writable layer +- this mechanism is called copy on write, and the changes we made are lost unless we use volumes + +## Volumes + +- containers should be stateless as they can be easily created and destroyed, scaled up and down +- we can have data that we want to persist even if containers are killed +- this data shouldn't be stored inside containers, or we may lose that data +- volumes - mapping a persistent storage to docker containers. the persistent storage can be cloud storage, e.g. s3 of aws or our host directory system +- this way, every time a container tries to persist changes, they go to the persistent storage and don't get lost irrespective of how many times the container is started or stopped +- volumes can be of three types - + - anonymous volumes + - named volumes + - bind mounts +- `docker volume ls` shows all volumes +- anonymous volumes are managed by docker +- the **reference** to anonymous volumes are lost after the container shuts down +- if we use `--rm` flag while running the container, the anonymous volume is deleted as well +- we can create anonymous volume by using `VOLUME ./feedback` inside the Dockerfile +- we can also create anonymous volume by using flag `-v /a/b` during `docker container run` where /a/b is the path inside the container +- named volumes are managed by docker too +- unlike anonymous volumes, we don't lose the reference to named volumes after the container is deleted +- use flag `-v` to create named volumes, e.g. `-v feedback:/app/feedback`, where the name of the volume is feedback and the directory of the container it maps to is `/app/feedback` +- bind mounts are managed by us. it can be used for source code, so that the changes that we make to the code get reflected in the container +- in case of bind mounts, we have access to the folder which gets mapped to the container's folder +- in case of clashes, the more specific paths win e.g. if we are using bind mounts for /app of container and anonymous volumes for /app/node_modules of container, /app/node_modules relies on anonymous volumes +- using nodemon with bind mounts prevents us from rebuilding images repeatedly i.e. our changes in source code are accounted for in the running container +- we can use suffix `:ro` so that it specifies to the container that the volume is read only e.g. 
`-v $(pwd)/app:ro`, so that only hosts and not containers can edit the source code +- note: `docker volume ls` will not list bind mount volumes, since it doesn't manage them +- `docker volume rm volume_name` to remove volumes +- `docker volume prune` to remove volumes not being used by any containers +- `docker volume inspect volume_name` to get details of the volume + +## Arguments and Environment Variables + +- docker supports build time arguments and runtime environment variables +- runtime environment variables can be provided using `ENV PORT 80` inside the Dockerfile +- we can also provide it dynamically using `-e PORT=80`, which makes the earlier method a default +- for situations like api keys where security is a concern, the method suggested above is better +- we can also use build arguments, i.e. dynamic variables used when building an image +- can be done using `ARG PORT=80` in the Dockerfile +- my understanding - so basically, arg is used by all commands above cmd and env is used by cmd? + +### Example + +```Dockerfile +ARG DEFAULT_PORT=80 +ENV PORT $DEFAULT_PORT +EXPOSE $PORT +``` + +- we are giving the value of the build argument to the environment variable +- if we don't provide a port, the port used by container is `80` +- now, we can change the default port while building an image using `docker image build ... --build-arg DEFAULT_PORT=9999 ...` +- we can also receive a dynamic port using `docker container run ... -e PORT=9545 ...` +- if we don't provide a port dynamically, the port specified for building of images gets used + +## Networks + +- there are three kinds of entities with which containers can communicate - + - internet + - host + - other containers +- containers can by default talk to the internet e.g. a public api +- for containers to talk to the host, we can replace localhost by `host.docker.internal` +- e.g. for containers to talk to mongodb running on our host machine, we can use `mongodb://host.docker.internal:27017/favorites` +- for containers to talk to other containers, we can use `docker container inspect ...` to get the container's ip address (available in the key IPAddress) and then use it. e.g. with a mongodb container running, we run `docker container inspect mongodb` and then use `mongodb://the_ip_address:27017/favorites` +- this is not ideal, as this IP could change after a new container replaces the old one +- we can create a docker network, and all containers placed inside the network can reference each other directly using the container names, e.g. `mongodb://mongodb_container_name:27017/favorites` +- `docker network create network_name` to create a network +- `docker container run ... --network=network_name ...` to create a container inside a specific network +- also, we don't need `-p` for the container to which another container connects, i.e. `-p` is only needed when we want our host port to map to the container port, not when another container wants to communicate with it +- docker networks support different kinds of drivers. the default driver is bridge, which we saw above +- there can be other types of drivers and third party plugins for drivers as well +- we can use driver as "host" so that isolation between the container's network and localhost is removed +- examples of usage - `docker network create --driver bridge` or `docker container run --network host` +- we can clean up unused networks using `docker network prune` +- the bridge type of network uses network namespaces behind the scenes. 
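  for example, we can inspect the default bridge network to see the subnet docker has allocated for it and the containers currently attached to it (a quick check - the exact output differs per machine) -
  ```sh
  docker network inspect bridge
  ```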
so, on running `ip addr`, we see docker0, which is basically the virtual switch in network namespaces. each container is encapsulated inside its own network namespace. an e.g. is shown below - + ```sh + docker container run nginx + docker container inspect <> | grep SandboxKey + # the output is /var/run/docker/netns/<> + ``` + +## Docker Compose + +- docker compose helps in preventing having to run docker commands from cli repeatedly +- it has syntax in yml which is easier to read and can be shipped with our code +- services in docker compose are containers, for which we can define environment variables, network, image, etc +- version of docker compose I had to use was 3.8 based on my [docker engine version](https://docs.docker.com/compose/compose-file/) + - note - i think it comes with docker in newer version, i just have to use `docker compose` now +- all container names are one level nested under the services key +- can specify networks, volumes key for each container +- for named volumes, we should also mention them under the volumes key in the root of the file +- all the containers are a part of the default network created by docker-compose +- `docker-compose up` starts all the containers and builds the images as well + - flag `-d` can be used to start in detached mode + - add flag `--build` to force the images to be rebuilt +- `docker-compose down` - deletes all the containers and the default network that docker-compose creates + - flag `-v` also removes the volumes which were created +- use `depends_on` key to ensure the order in which containers start e.g. server `depends_on` mongodb container +- `docker-compose build` to build the images +- `docker-compose run service_name` to run a specific container in the compose file under the services key + +## Issues while Containerizing Frontend Apps + +- docker doesn't work in the web browser for e.g. when we make xhr requests + - so referring the backend application just by container name won't work as it utilizes docker networks + - so, we publish the backend on a host port and simply use localhost:that_port in frontend +- reactJS needs the terminal to be in interactive mode to ensure it continues to run + - it is like adding `-it` flag while using `docker container run...`, or setting `stdin_open: true` and `tty: true` inside of the docker compose + +## CMD and ENTRYPOINT + +- when we specify `docker container run image_name xyz`, xyz replaces what there is in CMD +- however xyz appends what is there in ENTRYPOINT +- we can replace what is there in ENTRYPOINT using `--entrypoint` +- useful tip - since a space separated command needs to be a part of different items in an array, use `sh -c`. i.e. `CMD ["a", "b", "c"]` can become `CMD ["sh", "-c", "a b c"]` + +```Dockerfile +FROM ubuntu +ENTRYPOINT [ "sleep" ] +CMD [ "10" ] +``` + +- `docker image build -t ubuntu-sleeper .` +- run `docker container run ubuntu-sleeper`, sleep is of 10 seconds +- run `docker container run ubuntu-sleeper 20`, sleep is of 20 seconds +- run `docker container run -it --entrypoint=bash ubuntu-sleeper`, run bash in interactive mode + +## Setup Containers + +how do we set up initial project e.g. how to run `npm init` when we don't have node installed locally? 
below is an example for setup using node + +Dockerfile.setup - + +```Dockerfile +FROM node:14-alpine +WORKDIR /app +``` + +docker-compose-setup.yml - + +```yaml +version: "3.8" +services: + npm: + build: + context: ./ + dockerfile: Dockerfile.setup + stdin_open: true + tty: true + volumes: + - ./:/app + entrypoint: npm +``` + +now, we can use commands to help during development like - + +- `docker-compose -f docker-compose-setup.yml run npm init` +- `docker-compose -f docker-compose-setup.yml run npm i express` + +the `npm` in the command is the service name inside docker compose, and entrypoint was given as npm in docker-compose, otherwise we would have to run `docker-compose -f docker-compose-setup.yml run npm npm init` + +## About Kubernetes + +- kubernetes is the most widely used container scheduler +- modern infrastructure is created using immutable images, and an upgrade is performed by replacing the older images with newer ones using rolling updates +- we specify how many resources to run and kubernetes maintains that number +- it ensures that the resources run within the specified memory and cpu constraints +- kubernetes is cloud-agnostic and can also be run on-prem +- it has features like service discovery, load balancing, secret and configuration management, etc + +## Minikube and Kubectl + +- minikube allows us set up a single node cluster on our local workstation +- minikube is useful for development purpose +- kubectl is the kubernetes command line tool which allows to manage a kubernetes cluster +- add alias to .bashrc - `alias kubectl="minikube kubectl --"` +- configuring autocomplete for kubectl (restart terminal after running the command) - + ```bash + echo 'source <(kubectl completion bash)' >> ~/.bashrc + ``` +- minikube can be deployed as a vm or as a container (i am trying as a container for now) +- configuring minikube - + - `minikube config set driver docker` + - `minikube config set memory 8192` + - `minikube config set cpus 4` +- view config using `minikube config view` or `cat ~/.minikube/config/config.json` +- start minikube - + ```bash + minikibe start + minikube status + ``` +- pointing docker client installed locally to minikube's docker daemon - + ```bash + docker container ls + minikube docker-env + eval $(minikube -p minikube docker-env) + docker container ls + ``` +- to ssh into minikube - `minikube ssh`. now also we can run commands like `docker container ls` etc +- to get all running components, we can use `kubectl get all --all-namespaces` +- to shut down minikube, use `minikube stop`. it preserves the state +- to start minikube again, `minikube start` +- to delete the cluster, `minikube delete` +- can format output e.g. `kubectl version --output=yaml`. output format can be json as well +- `minikube ip` to get the ip address of minikube cluster +- an issue on my laptop - minikube cannot pull docker images at times. temporary fix is to pull manually using `docker image pull` after pointing docker client to minikube's docker daemon + +## Cluster Architecture + +- the cluster has master nodes and worker nodes. note: there can be multiple masters in the cluster +- the master nodes schedule and monitor the containers assigned to it on the worker nodes +- different methods of viewing information related to the different components e.g. 
etcd + - `ps aux | grep etcd` + - `sudo cat /etc/kubernetes/manifests/etcd.yaml` + - `docker container ls | grep etcd` +- it has 7 major components as described below + +### Etcd + +- a distributed key-value store that allows for fast storage and retrieval +- it runs on the port 2379 +- etcdctl is the etcd control client which helps communicate with etcd +- it is used for storing and retrieving information about all kubernetes resources +- the etcd clusters can either be present on the master nodes or be entirely decoupled from them +- **kubeadm runs etcd as a static pod on the master nodes** +- we specify its ip address and port on the api server +- an example of using etcdctl api version 3 - + ```sh + kubectl exec etcd-minikube --namespace=kube-system -- sh -c \ + "ETCDCTL_API=3 etcdctl get / \ + --prefix --keys-only --limit=100 \ + --cacert /var/lib/minikube/certs/etcd/ca.crt \ + --cert /var/lib/minikube/certs/etcd/server.crt \ + --key /var/lib/minikube/certs/etcd/server.key" + ``` + to get the location of the certs, use `kubectl get pod etcd-minikube --namespace=kube-system --output=yaml` +- peer to peer communication in etcd clusters when there are multiple master nodes happens through 2380 +- etcd is distributed i.e. we can read from any of the instances, while all writes go to the master in the etcd cluster which syncs the data on the other replicas +- in case of inconsistencies, the quorum determines if the update is valid. it is the minimum number of nodes in the etcd cluster which should have processed the update, which is floor(n / 2) + 1. the value of fault tolerance is total instances - quorum. so, it is recommended to have an odd number of etcd instances / master nodes depending on the configuration, since fault tolerance is the same for n and n - 1 nodes where n is even + +### Api Server + +- it runs on the master node +- external clients like kubectl communicate changes to the cluster via the api server +- schedulers, controllers, kubelets, etc. monitor the api server for new resources +- they also send updates to the api server which then updates it on the etcd cluster +- so, api server is the only component that directly interacts with the etcd cluster +- the api server on the multiple master nodes can run concurrently i.e. all api servers on all the master nodes can be active at once. however, in case of controller manager and scheduler, to avoid duplication and inconsistencies, they are in the active state on the master node which is elected as the leader while they are in standby mode on the other master nodes +- in case of multiple masters, clients like kubectl interact with a load balancer, where the load balancer routes requests to the multiple api servers + +### Controllers + +- also called controller manager +- different kinds of controllers run on the master node +- for instance, the master node expects heartbeats from the worker nodes. the node controller monitors them and if the heartbeats do not reach the master nodes for a certain time period, the pods on it are evicted +- similarly, we have replication controller to maintain the number of pods of the same type +- the controller manager package installs all the different controllers. 
to view the different controllers, use - + ```sh + kubectl get pod kube-controller-manager-minikube \ + --namespace=kube-system --output=yaml | grep controllers + ``` + +### Scheduler + +- runs on the master node +- it assigns pods to a specific node +- it does this based on available resources like cpu and memory and filters out nodes which cannot run the pod +- it then based on a priority function ranks the remaining nodes +- the pod then gets scheduled on one of the remaining nodes + +### Kubelet + +- it **runs on all worker nodes** (and optionally on the master node). see how this is different from the components seen above - etcd, api server, controller manager, scheduler +- unlike the rest, kubelet does not run as a static pod, daemon set etc. it **runs via a binary installed on the vms** - i think this flow deviates since is doesn't use static pods / daemon set +- it registers the nodes with the cluster +- picks up the pods from the api server to run on the node and then runs it +- it then sends updates of the status of the pod to the api server +- so, to view information, use `ps aux | grep kubelet` +- this will show the files locations, so, for e.g., use - `cat /var/lib/kubelet/config.yaml` + +### Kube Proxy + +- it runs on all nodes, since it **runs as a daemon set** +- pods in a node can reach pods on other nodes as well because of this +- the kube proxy **assigns an ip to the service** +- to view the ip range from which services are assigned ip addresses, we can use `kubectl get pod kube-apiserver-minikube --namespace=kube-system --output=yaml | grep service-cluster-ip-range` +- it **configures ip tables**, which maps the ip address of services to endpoints +- an endpoint = the ip address of the pod + port of the pod. this port can belong to any one of the containers, set by target port field in the service definition +- if we have multiple pods sitting behind a service, an algorithm similar to round robbin is used + +### Kube Dns + +- it maps the service name to the service ip address +- so, it configures the dns server +- kubernetes uses coredns for achieving this functionality +- on running `kubectl get deployment coredns --namespace=kube-system --output=yaml`, we can see that a config map is mounted as a volume on it +- we get the contents of it using `kubectl get configmap coredns --namespace=kube-system --output=yaml`. it shows the plugins being used by coredns +- there is also a service associated with kube dns, which we can get using `kubectl get service kube-dns --namespace=kube-system --output=yaml | grep clusterIP`. the pods point to this ip, which can be confirmed by inspecting the pod using `kubectl exec any_pod_name -- cat /etc/resolv.conf` + +## Pods + +- the smallest unit in kubernetes +- represents a single running process +- a pod encapsulates one or more containers, but usually we run only one container in a pod +- sidecar pattern - helper containers can be spun alongside the application container in the same pod +- to create a pod in an imperative way using commands, use `kubectl run db --image=mongo` +- to get all running pods, use `kubectl get pods` + - to get more information, we can use `--output=wide`, `--output=yaml` or `--output=json` +- we can do a dry run and get the yaml, e.g. 
`kubectl run db --image=mongo --dry-run=client --output=yaml` +- to see the list of events that occurred, use `kubectl describe pod db` +- [yaml file to create a pod declaratively](https://gist.github.com/shameekagarwal/8ca1f31a5d76b00c20a5a8a6da3b183b) +- now, we can run `kubectl apply -f file_name.yml` +- we can specify the file name in commands instead of the resource name - `kubectl describe -f file_name.yml`, `kubectl delete -f file_name.yml`, `kubectl get -f file_name.yml` +- suppose we want to execute a command against a container. one way would be to issue commands using docker, e.g. `docker container exec container_name command`, this is just like spawning off another process in an already running container. however, we have to issue this command on a particular node of the cluster. this may not matter for minikube since everything is on our local but would matter for a production cluster. another way is to run `kubectl exec pod_name -- command`. this would by default execute the command on the first container of the pod. we can also specify the container using `--container` flag +- to view logs, use `kubectl logs pod_name`. like in `exec`, the container can be specified explicitly +- containers of the same pod run on the same node. they can talk via localhost i.e. if a container is running in a pod on port 8080, the other container can make requests to localhost:8080. they also share the same volumes +- if we stop the container using `docker container stop container_id`, the pod will restart the container +- to stop a pod, use `kubectl delete pod db` +- when a pod is deleted + - it first sends `TERM` (terminate) signal to all processes of all containers of the pod + - if it does not stop within the `gracePeriod`, `KILL` signal is sent for a forceful shutdown +- all containers part of the same pod coexist in the same node i.e. they cannot be distributed across the nodes +- all of them can also access the same volume + +### Process of Creation + +![pod creation](/assets/img/docker-and-kubernetes/pod-creation.drawio.png) + +## ReplicaSets + +- it is a type of controller i.e. it tries to maintain a specified number of pods +- this provides high fault tolerance, high availability, self-healing mechanism, etc. +- replica sets are the newer version of replication controllers, since replication controllers are deprecated +- note: when setting image using `kubectl set image...` in replica sets, i had to delete the older pods +- [yaml example](https://gist.github.com/shameekagarwal/ec1a7d3c31814c789eae2d0e1c1ae569) +- look how in yml syntax, `spec.template` is the exact same as that of the contents of a pod +- labels defined for a pod should match the labels defined for its replica set i.e. value of `spec.selector.matchLabels` should match `spec.template.metadata.labels` else kubernetes gives an error +- however, the replicaset can manage pods not defined in the `spec.template` section as well. in this case, the labels of pods should match the selector of the replica set +- `spec.replicas` defines the number of pods to run +- use `kubectl get replicasets` and `kubectl get pods` to verify +- verifying the self-healing feature - if we try to delete a pod using `kubectl delete pod pod_name`, we will see that the replica set will automatically spin up a new pod +- deleting the replica set will delete the pods it spun up as well +- `kubectl delete -f replica-set.yml --cascade=orphan`. this will delete the replica set but not the pods. 
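+  a quick way to see the orphaning behaviour (file and resource names as used above) -
+  ```sh
+  kubectl delete -f replica-set.yml --cascade=orphan
+  kubectl get pods   # the pods are still running, now without an owner
+
+  kubectl apply -f replica-set.yml
+  kubectl get pods   # the recreated replica set adopts the pods matching its selector
+  ```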
so, in general, to prevent removal of downstream objects, use the `cascade` flag + +### Process of Creation + +only the first part has been described here, the remaining parts are similar to that of a pod + +![replica set creation](/assets/img/docker-and-kubernetes/replica-set-creation.drawio.png) + +## Services + +- pods are short-lived, so using addresses of pods for inter-pod communication is not reliable +- services can be used to expose pods, replication controllers, replication sets, etc +- the controller used here is called endpoint controller +- service can be of different types + - `NodePort` - target port on every node is exposed to the outside world. if we have multiple worker nodes, to hit a particular set of pods, we would have to use `worker_ip:node_port`. this also indicates that the services span multiple nodes without us having to configure anything + - `ClusterIP` - this is the default. exposes the port only inside and not from outside the cluster + - `LoadBalancer` - useful when deploying to cloud + - `ExternalName` - map a service to an external address like a url +- imperative command - `kubectl expose replicaset rs --name=svc --target-port=28017 --type=NodePort` +- note: [the node port cannot be specified](https://github.com/kubernetes/kubernetes/issues/25478) when using `kubectl expose` +- if we run `kubectl describe service svc`, we see that it has inherited all the labels of the replica set. recall how replica set is associated to the pods using labels, services are associated to the pods in the same way +- when describing a service, it also shows all endpoints aka pods it directs traffic to +- the three ports involved in node port are - + - node port - how to access from outside the cluster. hit `http://minikube_ip:NodePort`. if not specified, a free port is chosen at random for its value + - port - incoming traffic i.e. traffic from other pods or outside the cluster hit this port of the service + - target port - port of the pod to which the service should forward traffic. if not specified, it takes the same value as port. so, in yml, usually only the port is specified +- we can run `kubectl get endpoints` to get a list of all the endpoint objects. we can also get more information about a specific endpoint using `kubectl get endpoints endpoint_name --output=yaml` +- we can run `kubectl exec pod_name env` - here, we will get environment variables like `<>_SERVICE_HOST`, `<>_SERVICE_PORT`. this will have the ip address and port of the different services respectively +- communication - till now, we were using `<>` for communication. it can be expanded to `<>.<>`. if we don't specify the namespace-name, it defaults to the namespace in which the resource initiating the request is +- communication to services can be further expanded to `<>.<>.svc` or `<>.<>.svc.cluster.local`. 
this bit can be confirmed using `kubectl exec any_pod_name -- cat /etc/resolv.conf` under the search field + +### Process of Creation + +![service creation](/assets/img/docker-and-kubernetes/service-creation.drawio.png) + +## Liveliness Probe + +- used for configuring health checks, done at a container level +- if the health check fails, it applies the restart policy which defaults to always +- the restart policy is specified at the pod level and applies to all containers +- `initialDelaySeconds` - when should the probe start +- `timeoutSeconds` - after waiting for how many seconds should the probe fail +- `periodSeconds` - after how many seconds should the probe be repeated +- `failureThreshold` - how many consecutive health checks are allowed to fail +- code example + ```yaml + name: api + image: user-service + livenessProbe: + httpGet: + path: /actuator/health + port: 8080 + initialDelaySeconds: 20 + timeoutSeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + ``` + +## Readiness Probe + +- it is used to determine whether a pod is ready to serve requests +- it has the same configuration as liveliness probe +- ip addresses of unhealthy pods are removed from ip tables, so that the future requests do not make it to them + +## An Example + +- [a complete example](https://gist.github.com/shameekagarwal/1883a95d8be0a74030b77966d80196a0) of + - a database and exposing it using cluster ip + - backend service which talks to db, exposing it using node port, configuring health checks + +## Deployments + +- helps us achieve zero downtime when we deploy services +- we should not create pods or even replica sets directly +- deployments create replica sets behind the scenes +- when we make an update to for e.g. the image version, the deployment will first create a new replica set with the desired number of pods, and once that replica set has successfully scaled the pods, the deployment would mark the desired replicas of the older replica set as 0. a part of `kubectl describe deployment db` - + ``` + Type Reason Age Message + ---- ------ ---- ------- + Normal ScalingReplicaSet 12m Scaled up replica set db-5cc56bf6fb to 1 + Normal ScalingReplicaSet 4m22s Scaled up replica set db-76774bbdf to 1 + Normal ScalingReplicaSet 92s Scaled down replica set db-5cc56bf6fb to 0 + ``` +- a side note - the random characters that we see are actually the hash value of the pod template +- to create a deployment imperatively, use `kubectl create deployment nginx --image=nginx --replicas=2` + - we can also add flags `--dry-run=client --output=yaml` to generate the yaml +- deployment strategy can be rolling update (default) or recreate +- in recreate, the old pods are stopped and new ones are created in its place. this leads to some downtime. use recreate when the coexistence of two versions of the applications can cause inconsistencies e.g. db migrations +- in rolling deployments, the new replica set is scaled up and the old replica set is scaled down simultaneously gradually. they can be tweaked using `maxSurge` and `maxUnavailable` fields. at any given time, we can have a maximum of desired + `maxSurge` or a minimum of desired - `maxUnavailable` pods running. both can be absolute numbers or % and both default to 25%. since both versions of applications run in parallel, the response can be returned from either of the versions at random during deployment +- e.g. 
of rolling deployment - by using the following code, the deployment order is 3 old ➝ 3 old, 1 new ➝ 2 old, 1 new ➝ 2 old, 2 new ➝ 1 old, 2 new ➝ 1 old, 3 new ➝ 3 new + ```yaml + replicas: 3 + + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + ``` +- everytime we deploy in kubernetes, a rollout takes place and a revision is created +- we can monitor the status of the update to deployment using `kubectl rollout status -f deployment.yml` +- we can view the history of updates using `kubectl rollout history -f deployment.yml` +- we can also create a rollback using `kubectl rollout undo -f deployment.yml` + - if we want to go back to a much older version and not just the previous one, we can use `kubectl rollout undo -f deployment.yml --to-revision=2` +- side note: rollbacks might not always be possible e.g. if we had database migrations. so, we may need to roll forward in some cases i.e. implement a hot fix and redeploy the new changes +- using labels - + - `kubectl get all --show-labels` - show the resources with their labels + - `kubectl get all --selector=name=db,app=demo` - filter the resources using their labels + - e.g. to count the total number of resources in dev environment, use `kubectl get all --selector=env=dev --no-headers | wc -l` +- we can set image of a deployment using `kubectl set image deployment db db=mongo:3.3`, where the first db is the deployment name and the second db is the container name, since we can have multi container pod +- to add the default change cause to the `kubectl rollout history ...` output, append commands with `--record`, e.g. `kubectl apply -f infra --record`. this flag is deprecated but i cannot find its replacement +- to scale deployments imperatively, use `kubectl scale deployment api --replicas=2` +- both in deployments and in services, any one of the labels on pod need to be present in `spec.selector` + +### Process of Creation + +- a deployment controller will watch for new deployment creation requests +- it will then create replica set definitions on api server +- after this, the process of replica set creation is continued + +## Imperative vs Declarative + +- in declarative, we just tell the desired state which kubernetes tries to achieve +- e.g. `apply` follows the declarative approach +- however, in the imperative approach, we have to give clear instructions +- all commands like `create`, `edit`, `replace`, `expose`, `run` etc. are imperative +- using declarative approach we can track configuration using version control as well for iac +- imperative approach can be used for hot fixes / experimental purpose +- when using `apply`, we can see the last yaml configuration converted to json which we had sent under `metadata.annotations` in `kubectl.kubernetes.io/last-applied-configuration`. this is used by kubernetes to keep track of changes and is only available when we use `apply` +- if for e.g. we use `edit` to edit a resource, and that resource is not allowed to be edited, we just use `wq` to exit out of vim, and then that file gets saved to /tmp. we can then use `kubectl replace --force -f <>` to replace the existing resource with our newly configured one + +## Ingress + +- it is like a layer 7 load balancer built inside the kubernetes cluster +- makes the services inside cluster accessible from outside +- we also want features like ssl termination, route requests based on domain, etc +- my understanding - recall how a service based on labels can only expose a set of pods. instead of multiple node ports / load balancers i.e. 
one for each set of pods, we have one node port / load balancer which directs traffic to the ingress service. the ingress service can then direct traffic to the different cluster ips in the cluster +- kubernetes provides the ingress resource but not the ingress controller i.e. it provides the api which can be utilized by other third party implementations +- minikube has an addon that can be enabled + ```bash + minikube addons enable ingress + minikube addons list | grep ingress + ``` +- to verify, `kubectl get all --all-namespaces` should show the `ingress-nginx-controller-*` pod running +- ingress is spun up using a deployment and a node port to expose it outside the cluster +- it also deploys configmaps to manage configuration and cluster roles to monitor kubernetes resources +- all resources are deployed in the ingress-nginx namespace +- we can also hit the endpoint http://minikube_ip/healthz to verify the working of ingress +- we can also provide a domain so that the requests are routed based on domain names +- we can also provide a catch-all entry +- [in this example](https://gist.github.com/shameekagarwal/97db31a89ba766cf2d0634c561a1b3e9), if requests come from custom-api.com, and start with request path `/api` they are routed to the api service, but all other requests are routed to the devops service +- note: to simulate that requests are coming from a specific domain on our local, we can use `curl -H "Host: custom-api.com" http://192.168.49.2/api/` +- the ingress resource provided by kubernetes has limited functionality, so to configure the ingress controller provided by third party, we use annotations +- e.g. we want traffic from ingress-service/calendar to our calendar-cluster-ip:port. so, the calendar prefix should be removed. we can do this by using the annotation below - + ```yaml + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / + ``` +- so, `/calendar` in `rules[x].http.paths.path` gets replaced by the value in `rewrite-target` which is `/` here +- use `kubectl get ingress` to view the ingress resources +- my understanding - view the port of the node port service `ingress-nginx-controller` inside the `nginx-ingress` namespace. this is the port we hit when making requests to worker_node_ip + +## Volumes + +- references to files and directories made available to containers +- the file system can be anywhere, e.g. outside the host as well i.e. this could be used for aws ebs as well +- e.g. it helps us preserve data across pod restarts +- there can be several types of volumes like host path, git repo (like host path but the path is a git repository) and even cloud specific like aws elastic block store +- empty dir volume type - if a container crashes, a new container is spun up in the same pod. however, if we don't specify any volume, the container crash results in a loss of data. this can be prevented using empty dir volume type, which can survive container restarts but not pod restarts. it is usually chosen as the default by third party manifests and is expected to be replaced by a better solution like nfs +- an issue with using host volumes - it needs to be available on each node so that pods on different nodes can have access to it, and this of course is not an issue with minikube +- so, in cloud, we should ideally mount an nfs on each node, else we would have to copy this file on all nodes. for e.g., we should use aws efs. 
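+  a rough sketch of what an nfs backed volume could look like (the server address and export path here are made up) -
+  ```yaml
+  spec:
+    containers:
+      - # ...
+        volumeMounts:
+          - mountPath: /data
+            name: shared-data
+
+    volumes:
+      - name: shared-data
+        nfs:
+          server: 10.0.0.5   # e.g. the nfs server / efs mount target address
+          path: /exports
+  ```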
the syntax should be similar + +### Example 1 + +- for docker client to be able to communicate to the correct docker daemon, use the file /var/run/docker.sock +- e.g. we want to run docker commands on the host from pods. so, the container running inside the pod should have docker client installed to issue docker commands, and it should point to the docker daemon of the host +- so, we can use `hostPath` volume type +- [full yaml here](https://gist.github.com/shameekagarwal/f1686cffac86159b5259142f3044f731) +- now, we run the pod using `kubectl apply -f docker.yml` +- then, we can issue commands like `kubectl exec docker -- docker image ls` to list the images on minikube + +### Example 2 + +- recall how for bind volumes in docker, we needed to specify a path in the host. the host now is minikube, so the host path needs to be that of minikube. before running `minikube start`, if i copy files to the path in ~/.minikube/files directory on my workstation, i can see those files in the root on minikube host. we can verify this using `minikube ssh` and then by running `ls /` +- so, suppose we want to specify a configuration file for prometheus +- we can copy this configuration file to minikube and then use host path volumes to reference it +- in this example, a better solution would have been to create a custom image and use `COPY` in the docker file + +```yaml +# ... +spec: + containers: + - # ... + volumeMounts: + - mountPath: /etc/prometheus/prometheus.yml + name: prom-conf + + volumes: + - name: prom-conf + hostPath: + path: /prometheus-conf.yml + type: File +``` + +## Config Maps + +- we can have different sources of configuration like environment variables, files, env files, literals, etc + +### Default Config Map + +- it is used to make calls to the kubernetes api from containers +- `kubectl get configmaps` - kube-root-ca.crt is the config map created by default +- `kubectl describe pods pod_name` will give the mount location of this config map. note that this config map may not be mounted to the pods in kube-system though +- `kubectl exec pod_name -- ls /var/run/secrets/kubernetes.io/serviceaccount` shows that there are three files - namespace, ca.crt and token +- on reading online, i see that this can also be a secret instead of a configmap + +### Mount Volumes + +- config maps can mount the configuration as volumes to running containers +- imperative command - `kubectl create configmap prometheus-config --from-file=prometheus-conf.yml` +- `kubectl describe configmap prometheus-config` +- using in the yml file - + ```yaml + spec: + containers: + - # ... + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + # ... + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + ``` + verify using `kubectl exec prometheus -- cat /etc/prometheus/prometheus-conf.yml` +- instead of providing a file, we can use literals, e.g. `kubectl create configmap --from-literal=foo=bar` +- in this case, if we use volume mounts, a file called foo would be created with its contents as bar + +### Environment Variables + +- e.g. 
create a file called .env + ``` + client-id=qwerty + client-secret=12345 + ``` +- `kubectl create configmap api-credentials --from-env-file=.env` +- my understanding - the difference between `--from-file` vs `--from-env-file` is in from file, kubernetes does not care about the file's content, while in from env file, it knows how to treat as different key value pairs, so that it can inject them all at once / individually as discussed below +- usage - + ```yaml + containers: + #... + envFrom: + - configMapRef: + name: api-credentials + ``` +- verify using `kubectl exec alpine -- env` +- we can also inject the variables of the config map individually - + ```yaml + containers: + #... + env: + name: CLIENT_ID + valueFrom: + configMapKeyRef: + name: api-credentials + key: client-id + ``` + +## Secrets + +- secrets are similar to config maps +- secrets can be of three types - + - docker-registry - for pulling images from private registry + - tls - for storing certificates + - generic - works like config maps, so can have sources like `--from-env-file`, `--from-file`, `--from-literal` +- creating a secret imperatively - + ```sh + kubectl create secret generic jenkins-credential \ + --from-literal=username=johndoe \ + --from-literal=password=incognito + ``` +- to retrieve the original value - + ```sh + kubectl get secret jenkins-credential --output=json + kubectl get secret jenkins-credential --output=jsonpath="{.data.password}" | base64 --decode + ``` +- to use the secrets, we put them into files /etc/secret/jenkins-user and /etc/secret/jenkins-pass - + ```yaml + spec: + containers: + - # ... + volumeMounts: + - mountPath: /etc/secrets + name: jenkins-credentials + + volumes: + - name: jenkins-credentials + secret: + secretName: jenkins-credential + defaultMode: 0444 + items: + - key: username + path: jenkins-user + - key: password + path: jenkins-pass + ``` +- we made it read only for all users using 0444 as the mode +- verify using `kubectl exec pod_name -- cat /etc/secrets/jenkins-pass` +- if creating secrets declaratively, the values should be base64 encoded first + ```yaml + # ... + data: + username: am9obmRvZQ== + ``` + using sh base64 utility - + ```sh + # to encode + echo -n johndoe | base64 + + # to decode + echo -n am9obmRvZQ== | base64 --decode + ``` +- the only difference between config maps and secrets is that secrets are stored in tmpfs (temporary file storage) thus leaving no trace on the nodes +- secrets should be combined with rbac for limited access +- **cons of using kubernetes secrets**: secrets are stored in plain text in etcd, so anyone with access to etcd can read the secrets. so, we should use solutions like hashicorp vault, integrating it with kubernetes is smooth + +### Docker Registry + +- by default, we use public docker registry +- sometimes we might need private registry +- we use `docker login` when using vanilla docker +- when using kubernetes, we can create the secret of type `docker-registry` + ```sh + kubectl create secret docker-registry registry-credential \ + --docker-server=...\ + --docker-username=...\ + --docker-password=...\ + --docker-email=... + ``` +- we can then specify the name of the secret in pod + ```yaml + spec: + imagePullSecrets: + - name: registry-credential + ``` + +## Namespaces + +- we can spin up multiple clusters to isolate the different environments. 
this can help prevent accidental changes to the production cluster +- however this has operational and resource overhead +- namespaces help us create different segments on a cluster +- namespaces are like virtual clusters +- we can scope resource limits and permissions to namespaces +- we use the "default namespace" by default +- we can run `kubectl get namespaces` to view all the available namespaces +- kube-public - the resources in this namespace are accessible to all (including unauthenticated) users +- kube-system - `kubectl get all --namespace=kube-system` shows the resources managed by kubernetes itself +- to create a namespace, use `kubectl create namespace staging` +- if we set the namespace in context, we do not have to repeatedly suffix commands by `--namespace=staging`. e.g. we can use `kubectl config set-context $(kubectl config current-context) --namespace=staging` +- when we delete a namespace, the cascading effect deletes all the resources within it as well. the command is `kubectl delete namespace staging` +- in the resource files, we can also specify the `namespace` key under `metadata` + +## Config + +- we can specify flags like `--key`, `--cert` and `--cacert` when making requests via curl to api server +- we can also specify flags in kubectl everytime, e.g. `kubectl get pods --client-key=... --client-certificate=... --certificate-authority=...` +- by default for all our requests using kubectl, the configuration is specified in ~/.kube/config. it also is like a kubernetes resource with `kind: Config`. so, instead of defaulting to ~/.kube/config in every command, we can specify the file using the `--kubeconfig` flag +- it has three parts - clusters, users and contexts +- clusters refer to the different kubernetes clusters that we would like to access +- the cluster requires the path to the ca server certificate and the api server address +- to get the server address, use `kubectl config view --output=jsonpath="{.clusters[0].cluster.server}"` +- the user requires the path to private key and signed certificate +- we can also provide the base64 encoded data directly instead of the path for the user / cluster +- contexts pair the clusters to users. so, they have the cluster, user and even the namespace to use by default +- the one used by default by kubectl is defined via `current-context` +- create a new cluster - + ```sh + cp ~/.minikube/ca.crt . + + kubectl config set-cluster johndoe \ + --certificate-authority ca.crt \ + --server https://192.168.49.2:8443 # cluster server address + + kubectl config get-clusters # verify that the cluster is created + ``` +- create a new user - + ```sh + kubectl config set-credentials johndoe \ + --client-certificate johndoe.crt \ + --client-key johndoe.key + ``` +- create and set the context - + ```sh + kubectl config set-context johndoe \ + --user johndoe \ + --cluster johndoe # create / edit the context + + kubectl config use-context johndoe # change the context + + kubectl config get-contexts # verify that the context is set + ``` +- we can view the entire config using `kubectl config view` or `cat ~/.kube/config` +- note: the context section can also take the namespace as an argument + +## Roles and Bindings + +- rules, comprise of - + - verbs, e.g. get, list, create + - resources, e.g. pods + - resource names + - api groups of the resources +- roles - they are a collection of rules. 
a role is applied to a namespace +- cluster role - same as roles but scoped to clusters +- resources like pods are namespaced while resources like nodes are cluster scoped. to get an exhaustive list, we can use `kubectl api-resources --namespaced=true` or set the flag to false +- subjects - can be - + - user + - service accounts - used by pods to interact with kubernetes api + - groups - collection of users and service accounts +- we also have role bindings and cluster role bindings +- `kubectl config get-users` - by default we have only one user minikube +- a few commands we can use include - `kubectl get roles`, `kubectl get clusterroles`, `kubectl get rolebindings`, `kubectl get clusterrolebindings` +- we can also use `kubectl describe clusterrole view` +- we already have some cluster roles and cluster role bindings created by default +- ones prefixed with `system:` should be generally avoided, so we can run `kubectl get clusterroles | grep -v system`. we get four roles, each of them has been described below +- view - can perform get, list and watch operations (verbs) on almost everything +- edit - everything that admin can do except modify roles and role bindings +- admin - everything that cluster-admin can do except modification to namespaces and resource quotas +- cluster-admin - can perform all operations. e.g. the default user minikube has this role. this can be verified by running `kubectl auth can-i "*" "*"` +- to verify if an operation can be performed, we can use for instance `kubectl auth can-i get pods` +- we can impersonate as someone else using `kubectl auth can-i get pods --as=johndoe` +- creating a role binding - + ```sh + kubectl create rolebinding johndoe \ + --clusterrole=view \ + --user=johndoe \ + --namespace=default + ``` +- verify using `kubectl describe rolebinding johndoe`. note: sometimes in kubectl outputs, the namespace field is empty when it is referring to the default namespace +- my understanding - role bindings can reference cluster roles, it just means that the permissions would be granted on the specified namespace only. this allows for role reuse. the view role is a cluster role, but by creating a role binding we can limit the user's view capabilities to a namespace +- delete a role binding using `kubectl delete rolebinding johndoe` +- using role bindings, we can attach one role to multiple subjects +- declaratively creating a cluster role binding using yaml - + ```yaml + apiVersion: rbac.authorization.k8s.io/v1 + + kind: ClusterRoleBinding + + metadata: + name: johndoe-view + + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: view + + subjects: + - apiGroup: rbac.authorization.k8s.io + kind: User + name: johndoe + ``` +- note how role bindings have a single role but can have multiple subjects +- verify using `kubectl auth can-i get pods --as=johndoe --all-namespaces` +- describing the admin cluster role created by default - `kubectl describe clusterrole admin` +- e.g. 
yml to create a custom role - + ```yaml + apiVersion: rbac.authorization.k8s.io/v1 + + kind: ClusterRole + + metadata: + name: release-manager + + rules: + - resources: ["pods", "pods/attach", "pods/exec", "pods/log", "pods/status"] + verbs: ["*"] + apiGroups: [""] + - resources: ["deployments", "replicasets"] + verbs: ["create", "get", "list", "update", "watch"] + apiGroups: ["", "apps", "extensions"] + ``` +- note: to grant permissions for different operations on pods, specifying the resources as `pods` is not enough, as there can be other sub resources like `pods/logs` etc + +## NodeName + +- recall that the [scheduler](#scheduler) decides which node to schedule a pod on +- if we run `kubectl get pod pod_name --output=yaml`, we can see the node it was scheduled on under `nodeName` +- behind the scenes, a binding object is created which binds the pod to a node +- we can manually specify the node a pod should be scheduled on using the `nodeName` property +- we can use this for e.g. if we didn't have a scheduler, and this would schedule the pod on the specified node + +## Taint and Toleration + +- taint is set on nodes which prevent any random pod from being scheduled on it +- toleration is set on pods which allows them to be scheduled on a node with taint +- by default, the pods have no toleration +- use case - a worker node has resources to enable running of a certain type of pod +- it means that only pods with toleration **can be** scheduled on the node with that taint +- however, the pods with this toleration can be scheduled on other nodes as well +- this feature is used by kubernetes as well to help ensure that normal pods are not scheduled on the master and only the management pods scheduled by kubernetes itself are +- to taint nodes, use `kubectl taint node node_name key=value:taint-effect` +- similarly, to remove the taint, use `kubectl taint node node_name key=value:taint-effect-`, i.e. suffix the prior command with a `-` symbol +- taint effects can be - + - `NoSchedule` - do not schedule any new pods without the right toleration + - `PreferNoSchedule` - prefer not scheduling + - `NoExecute` - like `NoSchedule` but also evicts the existing pods on the node without the correct toleration +- to apply toleration on pods, use - + ```yaml + spec: + tolerations: + - key: key + operator: Equal + value: value + effect: NoSchedule + ``` + +## Node Selectors and Node Affinity + +- we add labels to nodes and then add selectors for them to pod definitions +- this way, the pods with the node affinity can only be run only on specific nodes +- however, pods without the node affinity can still be spun up on the nodes with labels +- to label a node, we use - `kubectl label node node_name key=value` +- we can use `kubectl get nodes --show-labels` to verify +- to apply selectors for labels on nodes, use - + ```yaml + spec: + nodeSelector: + size: large + ``` +- but, using node selectors we cannot specify complex conditions +- so, we use node affinity + ```yaml + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: size + operator: In + values: + - large + - medium + ``` +- what if the node labels are changed after the pod was already scheduled? what if there are no nodes found with the conditions matching the node affinity value? 
for these, the value can be `requiredDuringSchedulingIgnoredDuringExecution` or `preferredDuringSchedulingIgnoredDuringExecution` +- my understanding - `requiredDuringSchedulingRequiredDuringExecution` is not available by default +- some operators to use - `Equal`, `In`, `Exists` +- so, overall, to ensure pods of a particular type and only this type end up on a particular node, we need to use node selectors / node affinity and taints and tolerations in conjunction + +## Resource Management + +- we can give an indication and set limits for the resources that can be used by kubernetes components +- specified at the container level +- this helps kubernetes in scheduling +- to enable metrics server, use - `minikube addons enable metrics-server` +- can be written as for e.g. `0.5` or `500m` (500 milli cpu). 1 milli cpu is equivalent to 1 hyperthread / 1 vcpu +- memory can be written as `K` or `Ki` for kilobyte, `M` or `Mi` for megabyte and so on. we can only specify the numerical value as well, its value is in bytes e.g. `256Mi` or `268435456` +- syntax - + ```yaml + containers: + #... + resources: + limits: + memory: 100Mi + cpu: 200m + requests: + memory: 50Mi + cpu: 100m + ``` +- limits - amount of resources that containers should not cross +- if the container crosses the memory limit, it will be terminated / restarted. the pod has status `OOMKilled` (out of memory killed). the pod remains the same, the container changes +- containers are not allowed to use more than the cpu limit for an extended period, so there are no restarts / termination of the containers for crossing the cpu limits as cpu usage gets throttled automatically +- requests - amount of resources that containers are expected to use +- only when the node runs out of memory, the pod that the container exceeding the requests is a part of is evicted from the node, and it gets rescheduled +- if a container's memory request exceeds the available memory on any node (technically sum of the memory requests of all the containers of a pod), the pod stays in `Pending` state indefinitely +- if the memory usage exceeds only the requested amount (and not the limit), the pod can be evicted if another pod enters with a higher qos and needs that memory +- so, memory limit cannot be exceeded while memory request can be exceeded if the node has enough memory +- `kubectl describe nodes` gives details of the available and in use resources +- `kubectl top pods` gives cpu and memory usage details because of the metrics-server addon. to get information related to the containers in the pod as well, we can use `kubectl top pods --containers` +- similarly, we can use `kubectl top nodes` +- prometheus is a better solution than metrics-server for real world use cases +- qos - quality of service determines the priority - guaranteed > burstable > best effort +- guaranteed - resource limit = resource request. note: remember that if only limits are defined, request = limit +- burstable - at least one container has limit / request defined, unequal limits and requests, etc +- best effort - no resources are defined at all +- we can view the qos assigned by kubernetes using `kubectl describe pod pod_name | grep QoS` +- additional concept - priority classes are useful for e.g. when two pods have the same `qosClass`. 
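+  a minimal sketch of defining one (the name and value here are made up) -
+  ```yaml
+  apiVersion: scheduling.k8s.io/v1
+  kind: PriorityClass
+  metadata:
+    name: high-priority
+  value: 1000000
+  globalDefault: false
+  description: "pods that should be scheduled / kept ahead of others"
+  ```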
we can run `k get priorityClasses` and then assign one of the values using `priorityClassName` under `spec` of the pod + +### Limit Ranges + +- limit ranges help us specify the following at a namespace level ([a yaml example](https://gist.github.com/shameekagarwal/75ae269c7c98c48c57ec215c9dbba20e)) - +- `default` - default resources limit +- `defaultRequest` - the default resources request +- `max` and `min` the maximum and minimum permitted values for the requests and limits +- `maxLimitRequestRatio` - the maximum limit to request ratio (limit should ideally be higher than request?) + +### Resource Quotas + +- limits the resources that can be consumed by a namespace. so, if we have multiple namespaces to support environments like dev and prod in our clusters, we can distribute resources equally so that there is no starvation for any of the environments. [a yaml example](https://gist.github.com/shameekagarwal/8343d4bb2e0029ee00f4d57f8f4b9306) +- using resource quotas, we can limit compute (e.g. cpu and memory requests and limits), storage (e.g. persistent volume claims) and object count (e.g. number of pods, number of node ports, etc.) + +## Daemon Sets + +- ensures one pod runs on each node +- e.g. logging and monitoring pods which need to be run on every node can be created using daemon sets +- even kube-proxy is run this way. to verify, use - `kubectl get daemonsets --all-namespaces` +- it used [node name](#nodename) but in newer versions [node affinity](#node-selectors-and-node-affinity) underneath + +## Init Containers + +- defined inside a pod +- before the long-running containers start, we might want to install binaries, etc +- init containers are run to completion one at a time sequentially before the normal containers start running +- their syntax in yaml is the same as normal containers + +## Static Pods + +- a pod created by kubelet itself on the node without involving the api server / etcd is called a static pod +- the kubelet continuously monitors a directory for changes +- so, when we can create a file in it, it gets picked up by the kubelet +- if we edit the file / remove the file, the kubelet automatically changes / terminates the pod accordingly +- this does not work for deployments etc. since they require controllers +- if the node is a part of a cluster, it will notify the api server about the static pod. so, `kubectl get pods` will show the pod, since the etcd cluster etc. know about these pods. however, unlike a normal pod, the only way to modify this pod is to modify the file +- use case - since static pods do not depend on control plane components like controllers, scheduler, etc., they are used to deploy the control plane components themselves +- unless configured otherwise, the directory is `/etc/kubernetes/manifests/` +- we can verify this in minikube after running `minikube ssh` by running `sudo ls /etc/kubernetes/manifests` that it has files for etcd, scheduler, api server and controller manager +- static pods will be suffixed by `-nodename` - `kubectl get pods --all-namespaces | grep minikube` +- if we run `kubectl get pod pod_name --output=yaml`, we can confirm that the owner is a node by going to `ownerReferences.kind` which should have the value `Node` +- to get the static pod path, use - `cat /var/lib/kubelet/config.yaml | grep staticPodPath` + +## Persistent Volumes + +- persistence of state should be decoupled from pods since they can be added / removed easily +- nfs is the way to go for disk storage in cloud. 
here, aws ebs has been shown +- note: ebs volumes should only be spun up in azs where worker nodes exist, since ebs is scoped to an az +- the `spec.capacity.storage` in the persistent volume defn. should be <= the capacity of ebs +- access modes can be `ReadWriteOnce`, `ReadOnlyMany`, `ReadWriteMany` +- we can run `kubectl get storageclasses` to get the available storage classes +- e.g. if we were using kops with aws, it would automatically add the storage class of gp2 for us +- default storage class admission controller observe requests for persistent volume claims and when a claim does not specify the storage class, it gets assigned the default storage class. when we run `kubectl get storageclasses`, we see that gp2 is marked as default +- [yaml example](https://gist.github.com/shameekagarwal/03e5e9dd6c43439d654792bb8822806d) of persistent volume +- persistent volumes are used through persistent volume claims. the idea is that admins create a set of persistent volumes, and developers use them via persistent volume claims +- there is a one to one mapping i.e. one persistent volume can only be used by one persistent volume claim +- `spec.storageClassName` and `spec.accessModes` should have the same value as that of persistent volume while the value of `spec.resources.requests.storage` should be <= the value of `spec.capacity.storage` so that the persistent volume claim can get a segment of the persistent volume +- because of this, if the persistent volume has more storage than what the persistent volume claim asks for, the claim gets the extra storage as well +- if no matching persistent volume is found, the persistent volume claim remains unbound indefinitely +- [yaml example](https://gist.github.com/shameekagarwal/a2afa15e76ee80c75a2dc19bfd234a54) of persistent volume claim +- usage - + ```yaml + spec: + containers: + - # ... + volumeMounts: + - name: jenkins-home + mountPath: /var/jenkins_home + + volumes: + - name: jenkins-home + persistentVolumeClaim: + claimName: jenkins-storage + ``` +- the status of a persistent volume can be - + - `Available` when no persistent volume claim is bound to it + - `Bound` when a persistent volume claim is bound to it + - `Released` when the persistent volume claim is deleted +- the default reclaim policy of a persistent volume is `Retain`. first, the pod / deployment is deleted, and then the persistent volume claim is deleted. now, the persistent volume has status of released. but it is not available to be bound because it already has existing data from previous pods which need to be deleted first +- so, we delete the persistent volume manually, try to clean up / delete the aws ebs manually and then can create new persistent volumes for the persistent volume claims +- till now, we used the manual method of provisioning volumes, i.e. static persistent volumes +- the dynamic method requires lesser intervention +- however, in case of a conflict, kubernetes will choose the static one +- important - the persistent volume is "created automatically" in case of dynamic persistent volumes, based on the persistent volume claim that we create +- when we delete the deployment and then the persistent volume claim now, the persistent volume as well as the actual nfs ebs volume is deleted automatically. 
this is because when using dynamic persistent volumes, the reclaim policy of the persistent volume is `Delete` +- [yaml example](https://gist.github.com/shameekagarwal/b5013b4645d62d287aeb2868ae37e5c3) for persistent volume claim for dynamic persistent volume +- the storage classes have a field called volume binding mode. this can be set to `WaitForFirstConsumer` i.e. persistent volume will not be bound to the persistent volume claim till there is a pod for the persistent volume claim. the other value that the binding mode can take is `Immediate` + +## Commands and Arguments + +- difference between command and entrypoint in docker is described [here](#cmd-and-entrypoint) +- for e.g. in the pod definition - + - `spec.containers[*].command` is used for replacing `ENTRYPOINT` of docker file + - `spec.containers[*].args` is used for replacing `CMD` of docker file + +## Patching Nodes + +- if we remove a node suddenly, the pods scheduled on it are lost +- if it was a part of a replica set, it would be rescheduled, but not if it was a normal pod +- to stop any further scheduling on the current node, run `kubectl cordon node_name` +- to stop any further scheduling and also evict existing pods, use `kubectl drain node_name` +- if the node included a pod not spun as a part of a controller, we have to add `--force`. this is because that pod would be lost forever +- so, i think drain already does what cordon does +- pods part of replica sets etc. will be rescheduled on other nodes, because that is the job of controllers +- after running the `drain`, we can start the upgrade +- to enable scheduling pods on the node again, run `kubectl uncordon node_name` +- my understanding - suppose a node goes down. so do the pods running on it. the time a controller waits to reconsider rescheduling the pod on another node is defined via `podEvictionTimeout`. this is why draining nodes is important, we don't rely on the timeout, and instead, rescheduling of pods happens gracefully +- to verify, use `kubectl describe node <> | grep Unschedulable` + +## Interface + +- for implementations to work with kubernetes, they should be compatible with the interface +- this allows for kubernetes to be extended with multiple implementations seamlessly +- e.g. cri for container runtime interface, cni for network, csi for storage, etc +- e.g. csi lays a set of rpc calls that the cri will make. the csi implementations should implement these rpcs diff --git a/_posts/2023-12-04-java.md b/_posts/2023-12-04-java.md new file mode 100644 index 0000000..2a74d50 --- /dev/null +++ b/_posts/2023-12-04-java.md @@ -0,0 +1,1578 @@ +--- +title: Java +--- + +## Object Oriented Programming + +### Basics + +- a java program can have any number of classes. the classes can have any name and the java program can have any name + - however, only one public class is allowed in a java program + - the name of both the public class and the java program should be the same +- when we compile a class using `javac Durga.java`, the number of class files generated = number of classes present in that program + ```java + class A { + + public static void main(String args[]) { + System.out.println("class A"); + } + } + + class B { + + public static void main(String args[]) { + System.out.println("class B"); + } + } + + class C { + } + ``` +- output of above program -
+ ![java main output](/assets/img/java/java-main-output.png) +- three pillars of object oriented programming + - encapsulation - helps with **security** and **abstraction** + - examples - data hiding (access modifiers), packages + - inheritance - helps with **reusability** and **abstraction** + - polymorphism - helps with **flexibility** + - compile time examples - overloading, method hiding, variable hiding / shadowing + - runtime examples - overriding + +### Import Statements / Package + +- two ways for using classes from libraries - + - fully qualified name of the class + - import statement at the top - preferred +- two kinds of import statements - + - **explicit import** - `import java.util.ArrayList;` - preferred + - **implicit import** - `import java.util.*;` + - note - implicit import does not include sub packages +- by default - + - all classes under `java.lang` package are available and need not be imported + - all classes under the same package as the package of the current class need not be imported +- **package** - group of related java programs +- different packages can have the same java program, e.g. `java.util.Date` and `java.sql.Date` +- universally accepted naming convention for package naming in java - reverse of domain name +- if we write the following program - + ```java + package com.learning.java; + + class Test { + public static void main(String args[]) { + System.out.println("hello world"); + } + } + ``` +- and try compiling using `javac Test.java`, it compiles. but when i tried running `java Test.java`, it failed + ``` + Error: Could not find or load main class Test + Caused by: java.lang.NoClassDefFoundError: Test (wrong name: com/learning/java/Test) + ``` +- so we should actually compile using - `javac -d . Test.java` +- this generates the entire directory structure of com/learning/java in the current working directory and places the Test.class there +- we can now run it using `java com.learning.java.Test` +- packages help with implementing **encapsulation** - the entire complexity / functionality is viewed as one unit residing inside a package + +### Class and Method Access Modifiers + +- classes have two different **access modifiers** - `public` and `<>` + - **default** classes are only accessible from only within the package + - **public** classes can be accessed from anywhere outside the package as well +- for inner class, apart from `abstract`, `final`, `public` and `<>`, we can also have `static`, `private` and `protected` modifiers +- **members** have four different **access modifiers** - + - **public** - access method from anywhere, inside or outside package + - **default** - can be accessed from inside package only, not outside the package + - **private** - can only be accessed from within the same class, not outside the class + - **protected** - can be accessed from anywhere inside package, and from subclasses if outside package as well + - small note - if accessing protected members from inside subclass but from outside package, only subclass reference can be used, not superclass (i.e. even polymorphism is not allowed) +- protected example - + - a/A.java - + ```java + package a; + + public class A { + + protected void a() { + System.out.println("from a"); + } + } + ``` + - b/B.java - + ```java + package b; + import a.A; + + class B extends A { + + public static void main(String[] args) { + A a = new A(); + a.a(); + } + } + ``` + - output -
+ ![protected caveat](/assets/img/java/protected-caveat.png) + - solution - change to - + ```java + B b = new B(); + b.a(); + ``` +- therefore, summary in tabular form - + + | visibility | public | protected | default | private | + |--------------------------------|--------|------------------------------|---------|---------| + | same class | ✅ | ✅ | ✅ | ✅ | + | subclass same package | ✅ | ✅ | ✅ | | + | non subclass same package | ✅ | ✅ | ✅ | | + | subclass different package | ✅ | ✅ (subclass reference only) | | | + | non subclass different package | ✅ | | | | + +- note - think about member visibility only when class is visible first (recall default vs public) +- access modifiers also help achieve **encapsulation** - interact with data members via exposed methods, not directly + +### Abstract Classes And Interfaces + +- `abstract` modifier is applicable for both methods and classes +- abstract method is used when we do not know about the implementation of the class upfront. e.g. Vehicle class can have an abstract method `getNumberOfWheels`. syntax - + ```java + public abstract Integer getNumberOfWheels(); + ``` +- if a class contains *even one* abstract method, it would have to be declared as abstract as well +- if a class is `abstract`, instantiation is not possible for the class +- also, if for e.g. we would not like for it to be possible to instantiate a class, we can declare it as abstract even if it does not have abstract methods +- subclasses are responsible to provide the implementation of the abstract methods of super class +- we can have multiple levels of nesting for abstract classes as well - abstract class Vehicle -> abstract class Car -> class RangeRover +- **interface methods are `public` and `abstract` without us specifying anything** +- so, when overriding in subclass, "method should be public" +- when **implementing** an interface - + - either override all the methods of the interface + - or make the class itself abstract +- code example - + ```java + interface I { + + void m1(); + void m2(); + } + + abstract class A implements I { + + public void m1() { + } + } + ``` +- abstract variables are not supported - so only one kind of member i.e. method is allowed for abstract, not variable +- so why use abstract classes and interfaces - + - **mandating a structure for an implementation** - "mandates" subclasses to provide implementation, else there will be compile time error + - **acting as a specification / contract** - e.g. we write servlet api compliant code, but that same code can run on different vendors like jetty, tomcat, weblogic, resin, oracle http server, etc which are all implementations of the same servlet api. same applies for jdbc and the different sql compliant drivers as well. + - **abstraction** - client will not know / need not care about the internal implementation +- note - all variables inside an interface are public static final, so they need to be initialized then and there. no instance variables can be created for an interface + +### Inheritance + +- inheritance helps use **is a relationship** +- we use `extends` to implement this +- members of the **superclass** are **inherited** by the **subclass** +- so, subclass can use members of the superclass +- the other way around does not hold i.e. 
superclass reference cannot use members of subclass +- all classes are implicitly subclasses of `Object` +- main advantage of inheritance - superclass will contain common functionality, thus helping us avoid duplication of logic in subclasses +- types of inheritance - + - **single inheritance** - one superclass and one subclass. supported in java + - **multilevel inheritance** - one superclass has one subclass, and that subclass again acts as a superclass for yet another subclass. this too is supported in java + - **multiple inheritance** - multiple superclasses, one subclass. not supported in java for classes, but supported via interfaces + ```java + class C1 extends C2, C3 {} // compilation error + + interface I1 extends I2, I3 {} // works + class C1 implements I1, I2 {} // works + ``` + - **hierarchical inheritance** - one superclass, multiple subclasses + - **hybrid inheritance** - combination of multiple types of inheritance +- inheritance example -
+ ![inheritance](/assets/img/java/inheritance.drawio.png) +- confusion cleared - we just said every class extends object. if a class C1 extends another class C2, it is extending both C2 and Object. then isn't this multiple inheritance? why did we say java does not allow multiple inheritance? + - when we do not extend any class, we extend Object implicitly + - when we extend a different class, we do not extend Object directly. so, the root class in the chain which does not have any explicit superclass extends Object implicitly. so it is basically multi level inheritance and not multiple inheritance which helps extend this subclass extend Object indirectly +- note - `final` class cannot have a subclass + +### Polymorphism - Overloading + +- **method signature** - method name + argument types +- in java, **return type is not a part of method signature** +- when resolving method calls, method signature is what gets used +- so, it is a compile time error if we try to add two methods with same signature, even if they have different return types +- **overloading** - when a class has multiple method with same names but different argument types +- advantage - same method is being used for multiple implementations +- **static polymorphism** / **compile time polymorphism** / **early binding** - in case of overloading, the decision around which variation of method to use is made at compile time +- example - + ```java + class Overloader { + + public void printer(int x) { + System.out.println("printing an integer: " + x); + } + + public void printer(String x) { + System.out.println("printing a string: " + x); + } + } + + public class Overloading { + + public static void main(String[] args) { + Overloader overloader = new Overloader(); + overloader.printer(1); // printing an integer: 1 + overloader.printer("hello"); // printing a string: hello + } + } + ``` +- **automatic promotion** + overloading in java - if when overloading, an _exact_ match is not found for a primitive type, java promotes to the next available primitive type using the following rules - + - byte -> short -> int -> long -> float -> double + - char -> int -> ... +- so, if refer the example above - there is no overloaded method for char. so, we jump to the next type as follows - + ```java + overloader.printer('a'); // printing an integer: 97 + ``` +- if no promotion is possible, we get a compile time error - + ```java + overloader.printer(10.5); // Overloading.java:19: error: no suitable method found for printer(double) + ``` +- if there is a clash during overloading for superclass vs subclass, subclass gets priority +- e.g. `null` can be used both for `Object` and `String`. so, if a method is overloaded for both of them and we pass it `null`, it will call the `String` implementation +- if there is clash during overloading for two classes which are independent, compiler throws an ambiguous exception +- e.g. `null` can be used both for `String` and `StringBuffer`. so, if a method is overloaded for both of them and we pass it `null`, it will throw an exception + ```java + overloader.printer(null); // Overloading.java:24: error: reference to printer is ambiguous + ``` +- since method overloading is compile time, the decision is influenced by the reference, not by the instance +- e.g. 
if i do `Object x = new String("s")`, and a method is overloaded for both `String` and `Object`, the object version would be called, since the decision is made by the type of reference - if i have two variations - `m1(Object obj)` and `m1(String str)`, the `m1(Object obj)` variation would be called + +### Polymorphism - Overriding + +- superclass reference can hold subclass instance +- the other way around does not hold i.e. subclass reference can not hold superclass instance +- **overriding** - subclass redefines method of superclass +- variations - + - superclass reference pointing to superclass instance - superclass method would be called + - subclass reference pointing to subclass instance - subclass method would be called + - superclass reference pointing to subclass instance - subclass method would be called +- the third variation is what interests us - compiler only checks if superclass has that method defined +- the method is called actually called on the instance during execution +- **dynamic polymorphism** / **runtime polymorphism** / **late binding** - in case of overriding, the decision around which variation of method to use is made at runtime +- **co variant** - when overriding, we can return subclass type of what superclass returns + ```java + class Parent { + public Object m1() { + return null; + } + } + + class Child extends Parent { + public String m1() { + return "hello world"; + } + } + + class CoVariant { + public static void main(String[] args) { + Parent p = new Child(); + System.out.println("covariant response = " + p.m1()); // covariant response = hello world + } + } + ``` +- if superclass method is final, we cannot override the method and we get a compile time error +- if superclass method is non final, we can override the method, and can also make it final in the subclass +- if method is private, there is no concept of overriding, since it is treated like an internal method. so, even if we redefine the method with the same name in the subclass, the compiler would not complain +- access modifiers + overriding - when overriding, we cannot reduce the scope, but we can increase the scope + ```java + class Parent { + public String m1() { + return "from parent"; + } + } + + class Child extends Parent { + protected String m1() { + return "from child"; + } + } + ``` +- output - + ``` + attempting to assign weaker access privileges; was public + ``` +- so, conclusion for access modifiers and overriding - for `private` methods, overriding concept is not applicable, for others - + - superclass - `public`, subclass can be - `public` + - superclass - `protected`, subclass can be - `protected`, `public` + - superclass - `default`, subclass can be - `default`, `protected`, `public` +- exception - below is **of course** only applicable for checked exceptions and not unchecked exceptions. below will make sense automatically as well if we think about `Parent p = new Child(); p.m1();` + - if subclass does not throw an exception, superclass can or cannot throw an exception + - if subclass throws an exception, superclass should throw a superclass of exception as well +- superclass `public static void m1()`, subclass - `public void m1()` - compile time error +- subclass `public void m1()`, superclass - `public static void m1()` - compile time error +- subclass `public static void m1()`, superclass - `public static void m1()` - works, but it is not overriding. it is **method hiding**. 
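- a quick hedged sketch of the static vs instance clash above (hypothetical classes, and the exact compiler wording may differ slightly) - the method hiding case itself is shown in the next example -
  ```java
  class StaticParent {
      public static void greet() { }
      public void describe() { }
  }

  class StaticChild extends StaticParent {

      // error: greet() in StaticChild cannot override greet() in StaticParent
      //        overridden method is static
      // public void greet() { }

      // error: describe() in StaticChild cannot override describe() in StaticParent
      //        overriding method is static
      // public static void describe() { }
  }
  ```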
this resolution is compile time, happens by reference, and the superclass version is called + ```java + class Parent { + public static String m1() { + return "hello from parent"; + } + } + + class Child extends Parent { + public static String m1() { + return "hello from child"; + } + } + + class MethodHiding { + public static void main(String[] args) { + Parent p = new Child(); + System.out.println("parent reference, child object responds with = " + p.m1()); + } + } + ``` +- output - + ``` + parent reference, child object responds with = hello from parent + ``` +- conclusion - **method hiding** is also example of **compile time polymorphism** / **static polymorphism** / **early binding** just like **overloading** +- **variable hiding / shadowing** - there is no concept of overriding for variable members. so, if we redefine the variable in the child class as well, resolution happens like in method hiding + ```java + class Parent { String s = "parent"; } + class Child extends Parent { String s = "child"; } + + class VariableShadowing { + public static void main(String[] args) { + Parent p = new Child(); + System.out.println(p.s); // prints 'parent' + } + } + ``` +- TODO: add double dispatch? + +### Object Type Casting + +- syntax - `A b = (C) d` +- three checks - 2 compile time, 1 runtime +- compile time check 1 - C and d should be somehow related. either should be superclass of other + - passes compilation - + ```java + Object o = new String("hello world"); + StringBuffer sb = (StringBuffer) o; + ``` + - fails compilation - `incompatible types: String cannot be converted to StringBuffer` + ```java + String str = new String("hello world"); + StringBuffer sb = (StringBuffer) str; + ``` +- compile time check 2 - obvious - C should be subclass of A or same as A + - passes compilation - + ```java + Object o = new String("hello world"); + StringBuffer sb = (StringBuffer) o; + ``` + - fails compilation - `incompatible types: StringBuffer cannot be converted to String` + ```java + Object o = new String("hello world"); + String s = (StringBuffer) o; + ``` +- runtime check 1 - actual instance d should be subclass of C or same as C. understand how this is different from compile time check 1 - there, we were checking if whatever reference is used for d, that should be somehow related to C. here however, we check if the actual runtime object that d holds is a subclass of C or same as C + - passes runtime - + ```java + Object o = new String("hello world"); + String s = (String) o; + ``` + - fails runtime - `ClassCastException: class java.lang.String cannot be cast to class java.lang.StringBuffer` + ```java + Object o = new String("hello world"); + StringBuffer sb = (StringBuffer) o; + ``` + +## Constructors + +- constructor helps with **initialization** +- `new` keyword helps with **instantiation** +- for constructor, method name should be same as the name of class +- only applicable modifiers for constructors are access modifiers +- use case - make the constructor `private`. now, an object for the class can only be created from inside the class. this can help us for e.g. implement the **singleton pattern** +- **if we do not add any constructor** for a class, the compiler adds the **default no args constructor** for us automatically +- note - this default no args constructor is added for abstract classes as well +- first line in our constructor should always be calls to `super()` or `this()`. 
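- a small hedged sketch of such constructor chaining (hypothetical `Vehicle` / `Car` classes) -
  ```java
  class Vehicle {
      Vehicle(String type) {
          System.out.println("vehicle constructor, type = " + type);
      }
  }

  class Car extends Vehicle {
      Car() {
          this("petrol"); // this() is the first statement
      }

      Car(String fuel) {
          super("car"); // super() is the first statement - Vehicle has no no args constructor, so the call is explicit
          System.out.println("car constructor, fuel = " + fuel);
      }
  }

  class ConstructorChaining {
      public static void main(String[] args) {
          new Car();
          // vehicle constructor, type = car
          // car constructor, fuel = petrol
      }
  }
  ```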
otherwise, exceptions like below are thrown + - `error: call to super must be first statement in constructor` + - `error: call to this must be first statement in constructor` +- if we do not add the super call ourselves, the compiler will automatically add `super()` for us +- note - this automatic adding of super happens for both constructors written by us and inside the default no args constructor +- convoluted example - + - our code - + ```java + class Test { + + Test(int i) { + this(); + } + + Test() { + + } + } + ``` + - what compiler generates + ```java + class Test { + + Test(int i) { + this(); + } + + Test() { + super(); + } + } + ``` +- when we have code like below, we get a compilation error because when the compiler generates `super()` automatically, it is not enough, since the superclass has only one constructor - the one we manually wrote. it requires an argument, which the compiler is not capable of defaulting + ```java + class Parent { + Parent(int i) {} + } + + class Child extends Parent { + } + ``` +- error - + ``` + Test.java:5: error: constructor Parent in class Parent cannot be applied to given types; + class Child extends Parent { + ^ + required: int + found: no arguments + reason: actual and formal argument lists differ in length + 1 error + ``` +- so, conclusion - we can only use either `super()` or `this()` and that too only in the first line of the constructor +- `super()` or `this()` can only be called inside a constructor and not inside any other method +- `this` and `super` keywords can also be used to reference instance variables +- note - `this` and `super` are always related to an instance, so they cannot be used inside `static` methods +- my doubt - how to handle variable hiding / shadowing for static variables if super is not allowed? solution - maybe use class prefix instead of super? +- constructor + overloading is a common pattern. we then can use `this()` inside them to call each other with default values for missing arguments +- a constructor can throw exceptions +- however, if superclass constructor throws an exception, the subclass constructor should throw the same exception or superclass exception of that exception. this is different from overriding, because here, the problem is the call to super because of which subclass has to throw an exception. it is not a superclass reference for a subclass instance. note that we cannot wrap with try catch, since super or this should be the first call + +## Strings + +- vvimp - **string** is **immutable**, **string buffer** (or builder) objects are **mutable** + ```java + class Introduction { + + public static void main(String[] args) { + String s = new String("Durga"); + s.concat(" Software"); + System.out.println(s); // Durga + + StringBuffer sb = new StringBuffer("Durga"); + sb.append(" Software"); + System.out.println(sb); // Durga Software + } + } + ``` +- `==` is for reference comparison. by default, `equals` in `Object` / custom classes work like `==` +- sometimes, classes can override this method, e.g. 
`String` class below overrides it for content comparison, while `StringBuffer` does not + ```java + class Equality { + + public static void main(String[] args) { + + String s1 = new String("durga"); + String s2 = new String("durga"); + System.out.println(s1 == s2); // false + System.out.println(s1.equals(s2)); // true + + StringBuffer sb1 = new StringBuffer("durga"); + StringBuffer sb2 = new StringBuffer("durga"); + System.out.println(sb1 == sb2); // false + System.out.println(sb1.equals(sb2)); // false + } + } + ``` +- heap is used for storing objects. string objects can be created when we use `new String()`, `str.concat("suffix")`, etc +- **scp (string constant pool)** is used for storing string literals. java stores them in the hopes of reusing them later +- note - scp a section in the heap itself, maybe it is present in a different location when compared to where java objects are stored +- while objects in heap are eligible for **gc (garbage collection)**, objects in scp are not, because java internally maintains references to the string literals stored in scp +- deeper understanding - scp is used for storing string literals. if i do `str.concat("suffix")`, suffix would be stored in scp, not concatenated result of str and suffix. the concatenated result will however be stored in heap +- so, it almost feels like that albeit in heap, scp is more of a compile time feature, while string objects are a runtime feature +- 2 in heap - (s1, String("durga")), (s2, String("durga")) and 1 in scp - "durga". s3 and s4 point to scp itself, while s1 and s2 point to both heap and scp. note how despite having the same string 4 times, it was stored only once in scp + ```java + String s1 = new String("durga"); + String s2 = new String("durga"); + String s3 = "durga"; + String s4 = "durga"; + ``` +- so, my understanding - whenever we have something in double quotes specified manually, all of that goes into the scp, while all other string objects created for e.g. manually using `new String...` etc go into the heap +- 3 in heap, out of which 1st and 2nd are eligible for gc - (,String("durga")), (,String("durga software")), (s, String("durga software solutions")) and 3 in scp - "durga", " software", " solutions" - + ```java + String s = new String("durga"); + s.concat(" software"); + s = s.concat(" solutions") + ``` +- in below examples, we compare equality using `==` and not `equals`, maybe because equals should anyway do content comparison, but here we see which references point to the same object +- equality of string literals - (equals compares reference and if not same, contents, while == just compares reference, and that evaluates to true since sl1 and sl2 are both pointing to the same object inside scp) + ```java + String sl1 = "durga"; + String sl2 = "durga"; + System.out.println(sl1 == sl2); // true + ``` +- concatenation for string literals can happen at compile time as well, which is why slc1 and slc2 point to the same object stored in the scp. this is probably happening due to optimizations that are performed on instructions + ```java + String slc1 = "durga software"; + String slc2 = "durga " + "software"; + System.out.println(slc1 == slc2); // true + ``` +- here, str2 is created at runtime, so str2 points to string object in heap while str3 points to string literal in scp. 
str2 does not point to a corresponding object in scp in this case + ```java + String str1 = "durga"; + String str2 = str1 + " software"; + String str3 = "durga software"; + System.out.println(str2 == str3); // false + ``` +- here, both strf2 and strf3 are created at compile time hence scp itself, because final variables would be replaced at compile time. understand how this behavior changed when compared to the example above, just by adding the `final` keyword + ```java + final String strf1 = "durga"; + String strf2 = strf1 + " software"; + String strf3 = "durga software"; + System.out.println(strf2 == strf3); // true + ``` +- the main advantage of scp - if a string is used multiple times, its instance need not be managed / tracked separately multiple times +- basically, jvm maintains a reference to strings in scp, so that there is no garbage collection happening there +- also, strings in scp cannot be mutated - when we make changes, new objects are stored in heap / new strings are stored in scp +- string buffers do not work like strings - string buffers do not use concepts like scp etc - so it is mutable - there is no referencing to same object in scp like concepts in string buffer +- in strings, `concat` and `+` both do the same thing +- other important methods in strings - `equalsIgnoreCase()`, `charAt()`, `length()`, `isEmpty()`, `substring()`, `replace()` (replace a certain character), `indexOf()`, `lastIndexOf()`, `toLowerCase()`, `toUpperCase()`, `trim()` +- **string buffer** - string is not meant for string content that can change frequently +- strings are immutable - for every change, a new object is created +- this is why we need string buffer. all changes we make happen on the same object +- since string buffer is mutable, it has two concepts - `capacity` and `length`. `capacity` determines how many characters the string buffer can hold, while `length` gives the current number of characters the string buffer has +- when we run out of space, memory is doubled, a new object is created and all the existing characters are copied +- other important methods in string buffer - `capacity()` (get the current capacity), `setCharAt()`, `append()` (works with most primitive etc types), `insert()` (insert a string at a specific position), `delete()` (delete substring from specified positions), `reverse()` (reverse contents of string buffer), `ensureCapacity()` (increase capacity to specified capacity upfront) +- note - all methods inside string buffer are synchronized - run `javap java.lang.StringBuffer` in terminal to view the **profile** of string buffer + ```java + public synchronized int length(); + public synchronized int capacity(); + public synchronized void setCharAt(int, char); + // and so on... + ``` +- so, at a time, only one thread can operate on a `StringBuffer`, thus affecting performance of applications +- so, we can also use `StringBuilder` +- the apis of **string builder** are almost the same as string buffer - so, it is like a "non synchronized version of string buffer" - run `javap java.lang.StringBuilder` in terminal - + ```java + public void setCharAt(int, char); + public int capacity(); + public int length(); + // and so on... 
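  // note - compared to the StringBuffer profile above, the synchronized modifier is gone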
+ ``` +- so higher performance at the cost of race conditions which we might have to take care of ourselves +- side note - strings are automatically thread safe since they are immutable +- **method chaining** - because most methods in `String`, `StringBuffer`, `StringBuilder` return the same object type, we can use method chaining technique + +## Exceptions + +- Throwable + - Exception + - RuntimeException + - ArithmeticException + - NullPointerException + - etc + - IOException - used when doing file related operations etc + - InterruptedException - used in multithreading related code etc + - etc + - Error - out of memory error, stack overflow error, etc +- **unchecked exceptions** + - runtime exceptions and its subtree + - error and its subtree +- everything else is **checked exception** +- **try with resources** - cleaner code, no need to call `close` explicitly, if they use the interface `AutoCloseable` + ```java + try (BufferedReader br = new BufferedReader(new FileReader("file.txt"))) { + // ... + } + ``` +- note - what we declare / assign inside the try statement is final, and cannot be reassigned +- `Closable` extends `AutoClosable`. `Closable` throws `IOException`, while `AutoClosable` throws `Exception` which is more generic +- when we are handling exceptions, it might happen that we lose track of the original exception, and throw another exception which is not that relevant. e.g. + - reading from a resource fails due to missing file + - closing the resource fails due to null pointer because the resource was never initialized properly +- the eventual exception we get is the null pointer, but the missing file exception would have helped us more in identifying the root cause +- so, we can also use `ex.addSuppressed(Throwable t)` or `Throwable[] t = ex.getSuppressed()`. 
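- a minimal hedged sketch of wiring this up by hand (hypothetical exception messages) - the fuller try with resources comparison follows below -
  ```java
  class ManualSuppression {

      public static void main(String[] args) {
          try {
              process();
          } catch (Exception e) {
              System.out.println("caught: " + e.getMessage()); // caught: could not read file
              for (Throwable t : e.getSuppressed()) {
                  System.out.println("suppressed: " + t.getMessage()); // suppressed: close failed
              }
          }
      }

      private static void process() throws Exception {
          Exception original = new Exception("could not read file");
          try {
              // cleanup fails too - attach it to the original instead of losing either one
              throw new RuntimeException("close failed");
          } catch (RuntimeException cleanupFailure) {
              original.addSuppressed(cleanupFailure);
          }
          throw original;
      }
  }
  ```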
this way, we can also find the original cause behind the exception +- note - try with resources will automatically make use of suppressions for us bts +- another note - when using try with resources, the null pointer exception will be added as a suppression to the file not found exception, because understand that the main exception that happened in the try block was file not found exception, and the null pointer exception happened inside the finally block + ```java + class CustomResource implements AutoCloseable { + + public void read() { + throw new RuntimeException("could not read file"); + } + + @Override + public void close() { + throw new RuntimeException("a null pointer exception happened"); + } + } + + public class SuppressionExample { + + public static void main(String[] args) { + + try { + System.out.println("without try with resources"); + withoutTryWithResources(); + } catch (Exception e) { + System.out.println(e.getMessage()); + for (Throwable t : e.getSuppressed()) { + System.out.println("suppress: " + t.getMessage()); + } + } + + System.out.println(); + + try { + System.out.println("with try with resources"); + withTryWithResources(); + } catch (Exception e) { + System.out.println(e.getMessage()); + for (Throwable t : e.getSuppressed()) { + System.out.println("suppress: " + t.getMessage()); + } + } + } + + private static void withoutTryWithResources() { + CustomResource customResource = null; + try { + customResource = new CustomResource(); + customResource.read(); + } finally { + customResource.close(); + } + } + + private static void withTryWithResources() { + try (CustomResource customResource = new CustomResource()) { + customResource.read(); + } + } + } + ``` +- output - + ``` + without try with resources + a null pointer exception happened + + with try with resources + could not read file + suppress: a null pointer exception happened + ``` +- we can also catch multiple exceptions using a single catch block + ```java + try { + + } catch (NullPointerException | ArrayOutOfBoundsException e) { + e.printStackTrace(); + triggerAlert(e); + } + ``` + +## Generics + +- what is generics - + - helps extend java's type system - types now start acting like parameters that we as clients can provide + - to allow a type or method to operate on objects of various types to thus allow **reusability**. e.g. without generics, we would use overloading, which causes a lot of duplication of logic - + ```java + class OverloadingProblem { + + public static Double add(Double a, Double b) { + return a + b; + } + + public static Integer add(Integer a, Integer b) { + return a + b; + } + + public static void main(String[] args) { + System.out.println(add(1, 5)); + System.out.println(add(1.2, 5.3)); + } + } + ``` + - while providing **compile time safety** - e.g. 
without using generics, we would use type casting, which has two compile time checks but one runtime check, and catching errors at compile time > catching them at runtime - + ```java + class TypeCastingProblem { + + private static Object item = null; + + public static void setItem(Object item) { + TypeCastingProblem.item = item; + } + + public static Object getItem() { + return item; + } + + public static void main(String[] args) { + setItem(1.4); + Integer item = (Integer) getItem(); + } + } + ``` + output - + ![generics typecasting](/assets/img/java/generics-typecasting.png) +- we use the **diamond operator** for generics + ```java + class Pair { + + private K key; + private V value; + + public Pair(K key, V value) { + this.key = key; + this.value = value; + } + + @Override + public String toString() { + return "{ " + key + ": " + value + " }"; + } + } + + class GenericExample { + + public static void main(String args[]) { + Pair score = new Pair<>("maths", 85); + System.out.println(score); + } + } + ``` +- **generic method** - my understanding - this is useful when the class itself is not generic / maybe method and class generics do not mean the same thing, so we can for e.g. use `T` for class and `V` for method + ```java + class GenericMethod { + + public static void printer(T arg) { + System.out.println("value is: " + arg); + } + + public static void main(String args[]) { + printer(1); + printer("hello"); + } + } + ``` +- while the return type above is void, we could have for e.g. returned `T` etc as well +- **bounded generic types** - bound the types that are allowed to be used, to get access to the additional functionality that is present in the types used in these bounds, e.g. only allow `Number` and its subclasses to be used for a generic class containing mathematical utilities +- we use the `extends` keyword to achieve bounded generic types, and the target type should be a subclass of the interface / class mentioned in this clause + ```java + public static > T calculateMin(T a, T b) { + return (a.compareTo(b) < 0) ? a : b; + } + ``` +- e.g. [`copy`](https://docs.oracle.com/javase/8/docs/api/java/util/Collections.html#copy-java.util.List-java.util.List-) is implemented as follows - my understanding - this is to help make use of dynamic polymorphism + bounded types. note - just because we can do `superclass_reference = subclass_reference`, does not mean we can do `List = List` + ```java + public static void copy(List dest, List src) + ``` +- we can also specify multiple bounds using `&` +- **type inference** - determine the types automatically. 
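- circling back to the multiple bounds point above, a small hedged sketch (hypothetical method) - when a class and an interface are combined, the class bound has to come first -
  ```java
  class MultipleBounds {

      // T must be both a Number (class bound) and Comparable (interface bound)
      static <T extends Number & Comparable<T>> T smaller(T a, T b) {
          return a.compareTo(b) < 0 ? a : b;
      }

      public static void main(String[] args) {
          System.out.println(smaller(3, 7));     // 3
          System.out.println(smaller(2.5, 1.5)); // 1.5
      }
  }
  ```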
some examples - + - java can automatically guess "the most specific type" that both `String` and `ArrayList` can work with - `Serializable` + ```java + class TypeInference { + + public static T getFirst(T a, T b) { + return a; + } + + public static void main(String[] args) { + Serializable result = getFirst("hello world", new ArrayList()); + } + } + ``` + - we use `List list = new ArrayList<>();` and not `new ArrayList()` + - we use `list.add("name")` and not `list.add("name")` +- note - just because `Number` and `Integer` are related via inheritance, it does not mean `List` and `List` are somehow related as well +- this is the motivation behind **wildcards** + ```java + import java.util.List; + + class Wildcards { + + private static void print(List list) { + list.forEach(System.out::println); + } + + public static void main(String[] args) { + List list = List.of(1, 2, 3); + print(list); + } + } + + // error: incompatible types: List cannot be converted to List + + // solution - notice the use of ? + // private static void print(List list) { ... + ``` +- **upper bounded wildcards** - when we use `?` and `extends`, e.g. allow all lists where the type of element is a subclass of the class specified in the generic method signature +- drawback - e.g. while we can print all elements of the list easily, we cannot add an element to the list - e.g. the list is actually of integer, and we might be trying to add a double to the list. since java cannot identify this problem, it gives a compile time error +- e.g. this works perfectly + ```java + private static void printList(List numbers) { + numbers.forEach(System.out::println); + } + + public static void main(String[] args) { + printList(List.of(1, 2, 3)); + printList(List.of(1.1, 2.2, 3.3)); + } + ``` +- however, if we add the below to the printList method - + ```java + private static void printList(List numbers) { + numbers.forEach(System.out::println); + numbers.add(7); + } + ``` +- we get the error below - + ``` + BoundedWildCardsExtends.java:7: error: incompatible types: int cannot be converted to CAP#1 + numbers.add(7); + ^ + where CAP#1 is a fresh type-variable: + CAP#1 extends Number from capture of ? extends Number + ``` +- **lower bounded wildcards** - when we use `?` and `super`, e.g. allow all lists where the type of element is a superclass of the class specified in the generic method signature +- so now, since java knows that the list passed to has elements of supertype of specified type, we can now add elements to the list of that type (dynamic polymorphism) +- drawback - we cannot read from the list - we have to treat the element as type `Object` + ```java + public static void addToList(List list) { + list.add(1.4); + } + + public static void main(String[] args) { + List list = new ArrayList<>(); + list.add(1); list.add("shameek"); + addToList(list); + System.out.println(list); + } + ``` +- use case of wildcards + bounded types - copy elements from one list to another - + ```java + public void copy(List source, List destination) { + source.forEach(destination::add); + } + ``` +- so, we should - + - use "lower bounded wildcards" when we want to perform some kind of mutation + - use "upper bounded wildcards" when we want to read values + - use "type parameters" when we want to do both reading and writing +- one difference between "type parameters" and "wildcards" is that type parameters allow for multiple bounds unlike wildcards, e.g. following is valid - `` +- rule of thumb? - use wildcards when possible, when not possible (e.g. 
we want to influence return type based on arguments), then use type parameters +- **type erasure** - java replaces all generic types we define with either Object, or the bound if a bound is specified +- as a part ofo this, java might introducing **type casting** etc as well +- e.g. the code below - + ```java + List list = new ArrayList<>(); + list.add(1); + Integer ele = list.get(0); + + class Store { T item; } + ``` +- is converted to this code due to type erasure - + ```java + List list = new ArrayList(); + list.add(1); + Integer ele = (Integer) list.get(0); + + class Store { Serializable item; } + ``` + +## Collections + +### List + +- `ArrayList` allows for control over **ordering** of elements +- all items are identified by an **index** +- items are located right next to each other in ram, thus making **random access via index o(1)** +- searching for items based on value is however o(n) +- adding items at the end is o(1) +- adding items at random positions is o(n), since it requires shifting of items by one position +- same logic is applicable for removal of items - o(1) for removing items from the end and o(n) for removing items from arbitrary positions +- size of array lists in java can change **dynamically** - once the amount of memory allocated gets over, a list with memory equal to double the size of the current list is provisioned, and all the items from the current list are copied over to the new list +- however, this ability to resize dynamically comes at a price - it takes o(n) time for this resize + copying over of items to the new location to happen +- however, when instantiating, we can provide the **initial capacity**, so that this resizing does not have to happen often +- disadvantage of array lists - when removing / adding items at random positions, a lot of **shifting** is needed to maintain the **contiguous** nature +- this problem is not there when using `LinkedList` +- since in linked lists, there is only a pointer to the next element that needs to be maintained +- disadvantage - linked lists don't allow random access with given index at o(1) time +- note - linked list in java is optimized - + - implemented as doubly linked list which allows it traversal in both directions + - maintains pointers to both head and tail - e.g. we can do use both `addFirst` and `addLast` at o(1) time +- linked list vs array list performance for adding elements at the beginning - + ```java + import java.util.List; + import java.util.ArrayList; + import java.util.LinkedList; + + class ListPerformance { + + public static void main(String args[]) { + perform("linked list", new LinkedList<>()); + perform("array list", new ArrayList<>()); + } + + private static void perform(String type, List list) { + long start = System.currentTimeMillis(); + for (int i = 0; i < 500000; i++) { + list.add(0, i); + } + long end = System.currentTimeMillis(); + System.out.println("time taken by " + type + ": " + (end - start) + "ms"); + } + } + ``` +- output - + ``` + time taken by linked list: 75ms + time taken by array list: 43375ms + ``` +- note - while we compared linked list to array lists above, as discussed later, if removing or adding to one of the ends, the most performant option we have is array deque, not stacks, not linked lists +- **vector** - **synchronized** implementation of **array list** i.e. all operations like add etc will do acquiring and releasing of lock +- generally, doing this using our own locks might be better, since we get more flexibility, e.g. 
batch multiple operations under one acquiring + releasing of lock +- **stack** - **lifo** structure (last in first out) +- important operations include `push`, `pop` and `peek` +- note - stacks use vectors underneath, so they are inherently synchronized + ```java + Stack stack = new Stack<>(); + stack.push("jane"); + stack.push("jackson"); + System.out.println(stack); // [jane, jackson] + System.out.println(stack.pop()); // jackson + ``` +- to avoid using synchronized version, we can use **array dequeue** instead + +### Queues + +- **queues** - **fifo** structure (first in first out) +- important operations include `add` (enqueue), `remove` (dequeue) and `peek` (retrieve but not remove last element) +- queues are abstract like stack as well - it is implemented using linked lists + ```java + Queue queue = new LinkedList<>(); + queue.add("jane"); + queue.add("jackson"); + System.out.println(queue); // [jane, jackson] + System.out.println(queue.remove()); // jane + ``` +- **priority queue** - objects being stored inside a priority queue should extend the `Comparable` interface +- this helps retrieve items form the structure in the order of their priority +- **dequeue** - double ended queue - o(1) for operating from either side of the collection. it is implemented by array dequeue and just like normal queues, we can implement it using linked lists instead as well +- note - java calls it deque and not dequeue + ```java + Deque dequeOne = new LinkedList<>(); + Deque dequeTwo = new ArrayDeque<>(); + ``` +- my doubt about performance - based on the fact that array dequeue might be using an array underneath, doing the typical "capacity resizing" that we discussed, would we have an even more performant solution if we were to use linked list? be it for implementing stacks or queues, logically performance of linked lists > array dequeues (dynamic capacity resizing issue) > stacks (synchronization issue) +- based on [this answer](https://stackoverflow.com/a/32625029/11885333), apparently not, because the main overhead that comes with linked lists is the extra creation of that node, garbage collection of that node, etc +- so, it is probably safe to conclude that in java, when we are looking for stack or queue implementation, we should use array dequeues almost always (over the stack since it is synchronized, and linked lists since it has memory overhead?) +- also, do not use array lists blindly - if we just have to remove and add elements to either ends, and do not need random access, array dequeues might be better than array lists (inserting at beginning of array list is o(n) and inserting at beginning of array deque is o(1)) + +### Maps + +- key value pairs +- also called **associative arrays** +- with maps, we ensure times of o(1) for **adding**, **removing** and **lookup** +- maps are **unordered** / do not support **sorting** +- the idea is that since **keys** in a map are **unique**, we transform the keys into an index between 0 to length - 1 of array using a **hash function**. then, accessing elements via the given key becomes o(1) - we just need to translate the key into an index using the hash function, and random access of elements in an array is an o(1) operation +- the hash function should be able to handle the type of key - e.g. 
if the key is an integer, using modulo operator with the length of array is enough, if the key is a string then ascii value of characters can be used and so on +- **collision** in hash tables - the hash function we used result in the same value for multiple keys +- **overwrite** - replace current value with new incoming value +- **chaining** - each **bucket** in the hash table can store a linked list. worst case scenario - all keys evaluate to the same value, so the entire map is just a single big linked list stored in one bucket, thus resulting in an o(n) complexity instead of o(1) +- **open addressing** - + - **linear probing** - try finding the next available empty slot - k + 1, k + 2, k + 3, ... disadvantage - **clusters** are formed i.e. elements with same hash are clustered together + - **quadratic probing** - try finding the next available empty sot using a quadratic polynomial - k + 1, k + 4, k + 9, k + 16, ... + - **rehashing** - perform another hashing on the key till an empty slot is found - h(h(h....(x))) +- so actually, worst case in hash tables for all operations - insertions, deletions and lookups are o(n) +- **load factor** - n / m, where n = number of items in the hash table and m = size of the array. if it is close to 1, the probability of collision will increase +- so, we can also do **dynamic resizing** of hash tables. disadvantage - this resizing is an o(n) operation +- in java, for `HashMap`, when the load factor becomes around 0.75, the dynamic resizing happens +- however, hash maps cannot be used in multithreaded scenarios, since they are not synchronized +- some important methods available in maps - `keySet()`, `entrySet()`, `values()` +- auto generated hash code example - look at how a prime number is used to generate a function with less collision chances + ```java + class Person { + + private Integer age; + + private String name; + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((age == null) ? 0 : age.hashCode()); + result = prime * result + ((name == null) ? 0 : name.hashCode()); + return result; + } + } + ``` +- note - the `equals` needs to be overridden as well. it might happen that due to chaining discussed earlier, multiple items end up in the same bucket of hash table. at that point, java might need to be able to differentiate between two different elements of the same hash +- so basically, java uses both chaining and dynamic resizing based on load factor by the looks of it +- `LinkedHashMaps` vs `HashMaps` - **linked hash maps** use a doubly linked lists underneath to track the "order of insertion", so the keys are basically ordered according to insertion time + ```java + Map hashMap = new HashMap<>(); + hashMap.put("aaa", 1); hashMap.put("bbb", 2); hashMap.put("ccc", 3); + System.out.println(hashMap); // {aaa=1, ccc=3, bbb=2} + + Map linkedHashMap = new LinkedHashMap<>(); + linkedHashMap.put("aaa", 1); linkedHashMap.put("bbb", 2); linkedHashMap.put("ccc", 3); + System.out.println(linkedHashMap); // {aaa=1, bbb=2, ccc=3} + ``` +- balanced bst (binary search trees) - **red black trees** and **avl trees** +- tree rotations are used to maintain this structure +- **tree maps** use red black trees unlike in hash maps, where an array like structure is used +- so, the keys are stored in sorted order in tree maps. 
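- a related hedged sketch - a tree map can also be given a custom `Comparator` for the key ordering (hypothetical example); natural ordering is shown in the snippet below -
  ```java
  import java.util.Comparator;
  import java.util.Map;
  import java.util.TreeMap;

  class ReverseTreeMap {
      public static void main(String[] args) {
          Map<String, Integer> treeMap = new TreeMap<>(Comparator.reverseOrder());
          treeMap.put("aaa", 1); treeMap.put("bbb", 2); treeMap.put("ccc", 3);
          System.out.println(treeMap); // {ccc=3, bbb=2, aaa=1}
      }
  }
  ```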
notice how it is automatically fr us below - + ```java + Map treeMap = new TreeMap<>(); + treeMap.put("ccc", 3); treeMap.put("bbb", 2); treeMap.put("aaa", 1); + System.out.println(treeMap); // {aaa=1, bbb=2, ccc=3} + ``` +- because it uses trees, operations have a guaranteed complexity of o(log n) in tree maps, whereas operations have mostly o(1) but sometimes o(n) complexity in case of hash maps +- my understanding - since a bst is being used, concept of collision, load factor, etc do not exist in tree maps unlike in hash maps +- so, for huge workloads, while we might have to consider tuning the load factor in case of hash set, we do not have to think about it in case of a tree set +- note - in newer versions, hash maps does not use linked lists (chaining) for each bucket, it uses red black trees for each bucket. this further optimizes the hash maps now +- because of the very nature - using a red black tree per bucket, using an array to store the multiple keys, etc - memory required by hash maps > tree maps +- but remember, reducing time > reducing memory with cloud etc + +### Sets + +- they allow **no duplicates** +- **hash sets** and hash maps work in the same way - a one dimensional array is used to store the elements by performing a hash on the element +- some important functions - `add`, `remove`, `retainAll` (calling `set2.retainAll(set1)` will retain all the elements in the set2 present in set1, and remove other elements from set2) +- so, operations are mostly are o(1) but can be o(log n) in worst case / o(n) when dynamic resizing is needed +- again, **linked hash sets** are same as hash maps, the insertion order would be maintained, which is maintained with the help of an additional doubly linked list +- finally, **tree set** are same as tree maps - maintain elements in a sorted order using a red black tree underneath, thus making operations o(log n) in general +- tree sets come with their own additional methods - e.g. `subset(a, b)` will give us a new set with all values of the set present between a and b, `first` for getting the first element, etc + +### Sorting + +- sort - notice "reverse order" below - + ```java + List list = new ArrayList<>(); + list.add(3); list.add(2); list.add(1); list.add(4); list.add(5); + System.out.println(list); // [3, 2, 1, 4, 5] + + Collections.sort(list); + System.out.println(list); // [1, 2, 3, 4, 5] + + Collections.sort(list, Collections.reverseOrder()); + System.out.println(list); // [5, 4, 3, 2, 1] + ``` +- we can implement `Comparable` on our custom classes to be able to sort them directly - + ```java + class Person implements Comparable { + + String name; + + Integer age; + + Person(String name, Integer age) { + this.name = name; + this.age = age; + } + + public int compareTo(Person person) { + Integer nameDiff = name.compareTo(person.name); + Integer ageDiff = age.compareTo(person.age); + return ageDiff != 0 ? 
ageDiff : nameDiff; + } + + public String toString() { + return "Person(name=" + name + ", age=" + age + ")"; + } + } + + class CustomSortComparable { + + public static void main(String[] args) { + + List people = new ArrayList<>(); + people.add(new Person("ayan", 25)); + people.add(new Person("ruth", 5)); + people.add(new Person("jack", 25)); + people.add(new Person("jane", 25)); + people.add(new Person("mike", 20)); + System.out.println(people); + // [Person(name=ayan, age=25), Person(name=ruth, age=5), Person(name=jack, age=25), Person(name=jane, age=25), Person(name=mike, age=20)] + + Collections.sort(people); + System.out.println(people); + // [Person(name=ruth, age=5), Person(name=mike, age=20), Person(name=ayan, age=25), Person(name=jack, age=25), Person(name=jane, age=25)] + } + } + ``` +- `Comparator` use cases - + - we want to sort using multiple techniques. compareTo can only have one implementation, therefore lacks flexibility + - we want to sort a class not in our control i.e. we cannot change the class to make it implement `Comparable` + - also helps achieve separation of concerns + + ```java + class PersonAgeComparator implements Comparator { + + @Override + public int compare(Person person1, Person person2) { + return person2.age.compareTo(person1.age); + } + } + + Collections.sort(people, new PersonAgeComparator()); + System.out.println(people); + // [Person(name=ayan, age=25), Person(name=jack, age=25), Person(name=jane, age=25), Person(name=mike, age=20), Person(name=ruth, age=5)] + + Collections.sort(people, new PersonAgeComparator().reversed()); + System.out.println(people); + // [Person(name=ruth, age=5), Person(name=mike, age=20), Person(name=ayan, age=25), Person(name=jack, age=25), Person(name=jane, age=25)] + ``` +- using lambdas - for a more functional style, we can use the following syntax as well 🤯 - + ```java + Collections.sort( + people, + Comparator.comparing(Person::getAge).reversed().thenComparing(Person::getName) + ); + System.out.println(people); + // [Person(name=ayan, age=25), Person(name=jack, age=25), Person(name=jane, age=25), Person(name=mike, age=20), Person(name=ruth, age=5)] + ``` + +### Miscellaneous + +- some methods, refer docs for more - + ```java + List list = new ArrayList<>(); + list.add(5); list.add(1); list.add(2); list.add(4); list.add(3); + + System.out.println("original list = " + list); // original list = [5, 1, 2, 4, 3] + + Collections.shuffle(list); + System.out.println("shuffled list = " + list); // shuffled list = [3, 1, 5, 4, 2] + + Collections.reverse(list); + System.out.println("reversed list = " + list); // reversed list = [2, 4, 5, 1, 3] + + System.out.println("min = " + Collections.min(list) + ", max = " + Collections.max(list)); // min = 1, max = 5 + ``` +- since collections are pass by reference, make collections unmodifiable so that clients cannot mutate our collections + ```java + List unmodifiableList = Collections.unmodifiableList(list); + unmodifiableList.add(-1); + // Exception in thread "main" java.lang.UnsupportedOperationException + // at java.base/java.util.Collections$UnmodifiableCollection.add(Collections.java:1091) + // at MiscellaneousMethods.main(MiscellaneousMethods.java:20) + ``` +- if we want to obtain a synchronized version of the normal collections we can use `List synchronizedList = Collections.synchronizedList(normalList)` +- drawback - coarse grained locking is used, all methods use `synchronized` keyword now +- so, better solution is to use concurrent collections that java provides, e.g. 
`ConcurrentHashMap` + +## About Java + +- object oriented programming language +- biggest advantage - portability. this model is called wora - write once, run anywhere +- three main components of java - jvm, jre and jdk +- jvm - java virtual machine +- we write platform independent java code +- this java code then gets converted to bytecode (the .class files that we see) +- jvm however, is specific to platforms +- jit - just in time compiler - it is a part of the jvm +- it receives bytecode as input and outputs the platform specific machine code +- jre - java runtime environment +- jre contains the jvm and class libraries - `java.Math...`, `java.Lang...`, etc +- so, jvm comes as part of jre, and we need jre since it has all the important classes +- jdk - java development kit. it contains the following components - + - jre + - it contains the java compiler (javac) that converts java files to bytecode + - the capability helps us debug our java programmes, etc +- jse - java standard edition +- jee - java enterprise edition - contains the jse + apis around transactions, servlets, etc. renamed to jakarta ee +- jme - java mobile edition / java micro edition - for mobile applications + +## Maven + +- maven is a **build tool** for java +- other alternatives are gradle, ant, etc +- **build** - process of building source code into **artifacts** that can be run +- maven has various **plugins** - + - **jar plugin** to create jars + - **compiler plugin** to help compile code + - **surefire plugin** to execute tests +- a plugin has various **goals**. goals represent a unit of work +- to examine a plugin, we can use the following commands - + ``` + mvn help:describe -Dplugin=org.apache.maven.plugins:maven-compiler-plugin` + ``` +- **maven coordinates** - + - **group id** - company / department name. domain name in reverse order is the convention + - **artifact id** - project name + - **version** - + - **packaging** - there are two types of packaging - **jar** (mostly used nowadays and the default) and **war** (web application archive) + - **classifier** - e.g. we want to build for different versions of java but use the same pom. so, we can use classifiers like `jdk8` and `jdk11`. these then get appended to the version, so people can import the right dependency +- out of these, the **gav** (group id, artifact id and version) help us uniquely identify the project +- to use these libraries in our projects, we use **repositories** +- there two repositories - **local repository** and **remote repository** +- basically, maven downloads from remote repositories and puts it into our local repository +- then, our projects running locally can use dependencies downloaded in this local repository +- default location for local repository is ~/.m2/repository +- default url for remote repository is https://repo1.maven.org/maven2/ (called **maven central**) +- we can configure remote repositories via settings.xml - so that we can use our own remote repository - use case - companies maintain their own remote repository, which is a mirror of maven central + +### Plugin Management + +- **lifecycle** has **phases** +- a phase has multiple goals attached to it +- if a phase does not have any goals attached to it, it would not be executed +- e.g. 
the clean lifecycle has three phases - pre-clean, clean and post-clean +- only the clean phase of the clean lifecycle is attached to a goal +- it is attached to the clean goal of maven-clean-plugin plugin 🤯 +- when we say `mvn clean`, we are actually instructing maven to run the clean phase +- when we run a phase, all the phases before it in the lifecycle are executed - in this case pre-clean would be executed first (if it has some goals attached to it, it does not by default) and the clean phase itself +- we just discussed that we typically invoke `mvn <>`, which runs all the goals of all the phases up to before the specified phase's lifecycle. however, we can also invoke a particular goal using the following syntax variations - + - `mvn plugin_group_id:plugin_artifact_id:plugin_version:goal` + - `mvn plugin_group_id:plugin_artifact_id:goal` + - `mvn plugin_prefix:goal` + - `mvn plugin_prefix:goal@execution_id` - while executions help us tie goals to phases, we can also invoke these executions directly + + ```sh + mvn org.apache.maven.plugins:maven-clean-plugin:2.5:clean + mvn org.apache.maven.plugins:maven-clean-plugin:clean + mvn clean:clean + ``` +- there are two kinds of plugins - + - **reporting plugins** - run during site generation + - **build plugin** - run to help build the project +- below - we try to tie the run goal of maven-antrun-plugin to pre-clean and post-clean phases - + ```xml + + org.apache.maven.plugins + maven-antrun-plugin + 3.0.0 + + + 1 + pre-clean + + run + + + + Learning Maven: pre-clean + + + + + + 2 + post-clean + + run + + + + Learning Maven: post-clean + + + + + + + + Learning Maven: standalone invoking + + + + ``` +- so, now when we run post-clean phase, all three phases - pre-clean, clean and post-clean would be run +- configuring a plugin + - a plugin can have multiple execution blocks. each execution block specifies - + - what goal to run + - what phase to tie this goal to + - configuration for the goal + - a configuration element can be specified in the root as well. earlier point was us basically specifying multiple execution blocks, which helped us tie goals to phases. this point here is about specifying configuration in the root block of the plugin. this can be useful when we invoke the plugin:goal directly + - dependencies - if a plugin has dependencies, we can for e.g. specify the version of that dependency using this block + - inherited - by default, the plugin configuration is inherited by the children. we can disable this behavior by setting inherited to false +- id should be unique across all executions for a plugin (not across plugins) +- apart from clean, the two other lifecycles are default and site +- the goals that are triggered for the default lifecycle are dependent on the packaging type (recall packaging type can be one of jar or pom, it is a part of maven coordinates). for jar, this is the table - + + | phase | plugin:goal | + |------------------------|-------------------------| + | process-resources | resources:resources | + | compile | compiler:compile | + | process-test-resources | resources:testResources | + | test-compile | compiler:testCompile | + | test | surefire:test | + | package | jar:jar | + | install | install:install | + | deploy | deploy:deploy | + +- when we specify dependencies in dependency management of parent, child projects can get these dependencies if they want to, but don't get the dependency unless added explicitly. 
**plugin management** works in the same way - inherit all the configuration related to the plugin specified in the plugin management section of the parent, but do not get it by default unless the plugin is added explicitly +- extra - executing scripts using exec maven plugin! - + ```xml + + exec-maven-plugin + 3.1.1 + org.codehaus.mojo + + + Renaming build artifacts + package + + exec + + + bash + handleResultJars.sh + + + + + ``` + +### Inheritance and Aggregation + +- `` helps determine the xsd (scm schema definition) version to use i.e. what elements are allowed in the pom file, how they should be configured, etc +- multiple levels of **inheritance** is supported in pom +- all pom (directly or indirectly) inherit from the [**super pom**](https://maven.apache.org/ref/3.6.3/maven-model-builder/super-pom.html) +- this inheritance helps us extract out common functionality around plugins, plugin configuration, dependencies, etc to a parent pom from which all other projects can inherit +- we can print the effective pom like so - `mvn help:effective-pom` +- my understanding - the parent might be managed separately - + - parent would be downloaded from the remote repository into the local repository, post which it can be used + - for development purpose - build the parent, which will install it in the local repository, and then build the child +- the parent might be managed in the same project, in which we can provide the `relativePath`. understand that this way, we do not have to build the parent project separately like above - +- also, packaging type in parent can be specified to be `pom` instead of relying on the default value i.e. `jar` +- till now, we discussed inheritance. we can also use **aggregation** in maven +- use case - when we run a phase e.g. `mvn clean`, `mvn install`, etc., it gets run for all the child projects as well +- not only that - in aggregate projects, if the child projects depend on each other, maven can determine the right order to build them in for us automatically +- we can also use the same pom for both aggregation and inheritance +- notes about versions - + - version property of parent gets inherited by the children as well + - for specifying the version of parent in the child, we use `${revision}` + - for specifying interdependencies between children, we use `${project.version}` +- based on everything above, a simple multi module setup - + - parent - + ```xml + + 4.0.0 + + org.apache.maven.ci + ci-parent + ${revision} + + + 1.0.0-SNAPSHOT + + + + child1 + child2 + + + ``` + - child - + ```xml + + 4.0.0 + + + org.apache.maven.ci + ci-parent + ${revision} + ../pom.xml + + + org.apache.maven.ci + ci-child + + + + org.apache.maven.ci + child2 + ${project.version} + + + + ``` + +### Dependency Management + +- we can specify a range of versions using `[3.8, 4.0)` (`[` for inclusive, `(` for exclusive) +- version format - `<>.<>.<>-<>` +- the **qualifier** `SNAPSHOT` is used for unstable projects, which can change frequently +- this way, if we depend on a project with snapshot in its version, we get access to the latest code always +- my understanding - if for e.g. 
we do not use snapshot - + - if the local repository already has an existing copy, maven would not bother refetching it from the remote repository to refresh the local copy + - probably sometimes the remote repository also would not allow pushing artifacts again against the same version +- bts, this SNAPSHOT is converted to timestamp automatically for us - so, `x.y.z-SNAPSHOT` basically becomes `x.y.z-timestamp`, and thus this way, maven always tries pulling the latest version for us +- maven is able to handle **transitive dependencies** for us - if our project depends on jar a which in turn depends on jar b, maven is able to download jar a and then jar b automatically for us when building the project +- **classpath** - location of classes and packages that our project is dependent on +- the different **dependency scopes** - + - **compile** - include dependency in all classpaths. the default if `scope` is not specified explicitly + - **test** - only required for compiling and executing tests, not required when executing therefore need not be included when packaging artifacts + - **runtime** - include dependency when project executes or tests are being run, but do not include them when compiling. e.g. jdbc driver like mysql connector. use case - we as developers will not mistakenly depend on these libraries + - **provided** - dependencies provided by the environment. e.g. we are developing a web application, we would need to depend on the servlet api to compile, but we would not want to include this in the war file, since it would be provided to us by the [servlet container](/posts/spring/#rest) + - **system** - like provided, but the path when compiling is specified manually + ```xml + + io.datajek + some-dependency + 1.0 + system + ${project.basedir}/libs/dep-1.0.jar + + ``` + - **import** - when a dependency is of type **pom** and has the scope of import, it should be replaced by its dependencies in its `dependencyManagement` section +- **dependency mediation** - choosing what version of dependency to use +- default behavior - + - e.g. our project depends on A and B. A depends on D which again depends on E (version x). B directly depends on E (version y). our project would use E (version y), because if we imagine dependencies like a tree, E (version y) is the closest to root + - e.g. our project depends on B and A (B comes first in pom.xml). B depends on E (version x), while A depends on E (version y). our project would use E (version x), because B comes first +- so one technique based on above - if we would like to use version x of E invariably - place version x of dependency E as *early as possible* and *directly* inside the pom. this way, we end up using the verison x of E always +- when adding a dependency, if we use the `exclusion` tag along with it, the dependencies specified in the exclusion tag are excluded from the dependency tree - + ```xml + + io.datajek + project9-projectb + 1 + + + com.google.code.gson + gson + + + + ``` +- this means that we should either expect gson to come as a transitive dependency from another project, or include gson manually inside our pom as another dependency, etc +- lets say our project name is xyz, and we mark a dependency in our pom as **optional** +- it excludes this dependency from being added as a transitive dependency in any project that has xyz itself as a dependency +- **dependency management** section - this way, all the projects in for e.g. 
a team can specify the versions of dependencies that work well with each other in one place, and all of it gets inherited by all other child projects +- example - + - if a parent has the following section - + ```xml + + + + com.google.code.gson + gson + 2.8.6 + + + + ``` + - the child can skip the version of gson when adding it as a dependency + ```xml + + + com.google.code.gson + gson + + + ``` +- another use case of dependency management section 🤯 - helps with transitive dependencies as well - e.g. if our project has a dependency A, which depends on C (version x), and has a dependency B, which again depends on C (version y). if we add the dependency C (version z) in the dependency management section, version z of dependency is the one that maven uses! + - note - we could also have included dependency C (version z) directly in the dependency section to force maven to use version z (default behavior - closest to the root wins). however, if another project added this project as a dependency, even if it was not using dependency C (version z) directly, it would still have it being added to its classpath. this problem would not have happened in the first place if we had added dependency C (version z) in the dependency management section as described earlier + +### Build Portability + +- **build portability** - having consistent ways to build cross environments, machines, teams, etc +- **variables** - variables defined in parent are inherited by children +- however, children can override these variables +- project can be accessed using `project`, e.g. `${project.version}`, `${project.build.sourceDirectory}`, etc. the root element of our pom is `project`, so that is where these variables come from. another very useful one i found - `${project.parent.basedir}` if for e.g. a child project wants to access something from the parent directory +- whatever we define in the properties section can be accessed using the name of the property directly, e.g. `${MyString}` +- java system properties (what we access using `java.lang.System.getProperties()`) can be accessed using `java`, e.g. `${java.home}` +- environment variables can be accessed using `env`, e.g. `${env.PATH}` +- variables in settings.xml can be accessed using `settings`, e.g. `${settings.offline}` +- **profiles** - alternative configuration for overriding default values +- we can specify profiles either in the project specific pom.xml, or in settings.xml, which itself can be machine / project specific + ```xml + + 4.0.0 + + io.datajek + project14 + 1 + Project14 + + + + test + + + + org.codehaus.mojo + exec-maven-plugin + + + my-special-exec + + /Project14/testScript.sh + + + + + + + + + + prod + + + + org.codehaus.mojo + exec-maven-plugin + + + my-special-exec + + /Project14/prodScript.sh + + + + + + + + + + + + + + + org.codehaus.mojo + exec-maven-plugin + 3.0.0 + + + my-special-exec + clean + + exec + + + + + + + + + + ``` +- the most basic way in which we can specify which profile to use - `mvn clean -Ptest` +- another way ofo enabling a certain profile - inside the `profile` section we saw above, we can have an activation section like below - + ```xml + + + testProp + DataJek + + + ``` +- this means that the profile expects the system property testProp to be of value DataJek - `mvn clean -DtestProp=DataJek` +- **archetypes** - a project templating toolkit so that a new project can be created easily with all the for e.g. 
firm specific standards established in the project from the get go diff --git a/_posts/2023-12-27-spark.md b/_posts/2023-12-27-spark.md new file mode 100644 index 0000000..c63bafa --- /dev/null +++ b/_posts/2023-12-27-spark.md @@ -0,0 +1,1213 @@ +--- +title: Spark +--- + +## Introduction + +- spark - developed at uc berkley as an improvement of hadoop +- it borrowed concepts from hadoop but now works independent of it +- unlike hive etc, spark doesn't convert into the map reduce equivalent +- it is much more performant than hadoop +- it offered much more flexibility when compared to hadoop - + - unlike hadoop which relies on yarn as a cluster manager, it can work with different cluster managers like mesos and kubernetes. cluster manager, resource manager, container orchestrator, etc all mean the same thing + - unlike hadoop which relies on hdfs for storage, it can work with cloud storage options as well +- eventually, it became its own thing as it has much more features +- why spark is popular - we write for e.g. sql and all the complexities around distributed processing is abstracted away from us. also, it is a unified platform - all capabilities - including batch processing, stream processing and ml are in one platform +- databricks - founded by original developers of spark. for e.g. it makes deploying spark applications much easier, has an optimized runtime for spark, etc + +## Driver, Executor and Deployment Modes + +- we submit our spark application to spark +- spark creates a master (called **driver** in spark) and slaves (called **executor** in spark) +- the driver will just assign work to the executors, while the executors perform all the heavy tasks +- when we used `local[*]` in the java program, we basically used the **local cluster manager**. this is why we never had to build a jar and submit it to a spark cluster. `*` probably means spark will decide how many threads to use, but we can specify a number as well. a single jvm is used in this case. this is a useful alternative when testing out things in local. both the driver and executors are inside one jvm +- apart from local, the real cluster managers supported by spark include yarn, mesos and kubernetes +- now, there are two **deployment modes** for running spark in an actual cluster - + - **client mode** - the driver will be run on the client side. the client itself will spawn executors in the spark cluster. this is what happens when we use interactive clients like spark-shell. so, the driver dies automatically when the interactive shell is closed + - **cluster mode** - both the driver and the executors will run on the cluster. this is what happens when we submit built jars to a spark cluster +- i think deployment modes are not applicable when using the local cluster manager, since there is no actual cluster over there in the first place, since both driver and executors were inside the same jvm + +## Getting Started + +- download spark from [here](https://spark.apache.org/downloads.html) +- `tar -xvzf spark-3.5.0-bin-hadoop3.tgz` +- pom.xml - the junit configuration was needed because otherwise i was getting "cannot access class sun.nio.ch.DirectBuffer". 
i am using java 8 and latest versions of spark and junit possible + ```xml + + + 4.0.0 + + com.example + spark-batch + 1.0-SNAPSHOT + + + 1.8 + 1.8 + 2.13 + 3.5.0 + 5.10.1 + 3.2.1 + + + + + org.apache.spark + spark-core_${scala.version} + ${spark.version} + + + + org.apache.spark + spark-sql_${scala.version} + ${spark.version} + + + + org.junit.jupiter + junit-jupiter + test + + + + + + + org.junit + junit-bom + ${junit.version} + pom + import + + + + + + + + maven-surefire-plugin + ${surefire.version} + + + --add-opens=java.base/java.lang=ALL-UNNAMED + --add-opens=java.base/java.lang.invoke=ALL-UNNAMED + --add-opens=java.base/java.lang.reflect=ALL-UNNAMED + --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED + --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED + --add-opens=java.base/sun.nio.cs=ALL-UNNAMED + --add-opens=java.base/sun.security.action=ALL-UNNAMED + --add-opens=java.base/sun.util.calendar=ALL-UNNAMED + --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED + + + + + + + ``` +- app - notice how we derive master and file name from args, so that we can use the same spark code for running in both cases - when we use locally installed hadoop in pseudo distributed mode and when we use the local cluster manager + ```java + public class Main { + + private static final Logger log = Logger.getLogger(Main.class.getName()); + + public static void main(String[] args) { + String master = args[0]; + SparkSession spark = SparkSession.builder() + .master(master) + .appName("TablesDemo") + .getOrCreate(); + + log.info("reading file..."); + String fileName = args[1]; + Dataset surveyDf = read(spark, fileName); + + log.info("performing transformations..."); + Dataset countByCountryDf = countByCountry(surveyDf); + + log.info("final stats = " + countByCountryDf.collectAsList()); + + // try (Scanner sc = new Scanner(System.in)) { + // log.info("waiting for user acknowledgement"); + // sc.nextLine(); + // } + } + + protected static Dataset countByCountry(Dataset surveyDf) { + return surveyDf.filter(col("age").lt(40)) + .select("Age", "Country", "state", "Gender") + .groupBy("Country") + .count(); + } + + protected static Dataset read(SparkSession spark, String fileName) { + return spark.read() + .format("csv") + .option("header", true) + .option("inferSchema", true) + .option("path", fileName) + .load(); + } + } + ``` +- we also pause the program for user input when using local cluster manager so that we can view the spark ui - the spark ui would only be visible while the job is running. the spark ui is visible at http://localhost:4040/jobs/ +- we can override defaults at cluster level from ~/spark-3.5.0-bin-hadoop3/conf/. 
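- e.g. copying `spark-defaults.conf.template` to `spark-defaults.conf` in that directory and setting a few cluster-wide defaults might look like this (the values below are illustrative placeholders, not recommendations) -
  ```
  spark.master                     yarn
  spark.executor.memory            1g
  spark.executor.cores             4
  spark.sql.shuffle.partitions     50
  ```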
this includes files like log4j2.properties.template (for logging), spark-defaults.conf.template (for configuring what we specify via `SparkConf`), spark-env.sh.template (for properties like java home) etc +- writing tests - note how because we had broken our application down into separate chunks using different methods, we were able to unit test our application easily - refactor the transformations into separate methods, which input and output `Dataset`, then simply call this method in the unit test and call `collectAsList` on the output to view it as a list and assert on it + ```java + @TestInstance(TestInstance.Lifecycle.PER_CLASS) + public class MainTest { + + SparkSession spark; + + @BeforeAll + void setup() { + System.out.println("setting up spark..."); + spark = SparkSession.builder() + .master("local[*]") + .appName("Main") + .getOrCreate(); + } + + @AfterAll + void cleanup() { + System.out.println("cleaning up spark..."); + spark.close(); + } + + @Test + void whenCsvIsRead_thenDatasetIsReadSuccessfully() { + Dataset input = Main + .read(spark, "src/main/resources/sample.csv"); + + assertEquals(9, input.count()); + } + + @Test + void whenCountByCountryIsCalled_thenResultIsOk() { + Dataset input = Main + .read(spark, "src/main/resources/sample.csv"); + Dataset output = Main.countByCountry(input); + Map countMap = output.collectAsList().stream() + .collect(Collectors.toMap((a) -> a.getString(0), (a) -> a.getLong(1))); + + assertEquals(4, countMap.get("United States")); + assertEquals(2, countMap.get("Canada")); + assertEquals(1, countMap.get("United Kingdom")); + } + } + ``` +- understand that we could not have directly performed assertions on the dataframe, a dataframe is just partitions of data sitting in different executors. so, we first call `collectAsList()` to get all the data into the driver, and then we can easily perform assertions +- we could also have generated mock data as below, instead of reading from csv in tests like we did above. 
both methods have their own pros and cons imho - generating mock data repeatedly has a lot of code, while reading using a csv means slower test - by mocking data we can generate data specific for each test, while using a csv does help with cleaner code + ```java + @Test + void whenCountByCountryIsCalled_thenResultIsOk_unit() { + StructType schema = new StructType(new StructField[] { + DataTypes.createStructField("Age", DataTypes.IntegerType, true), + DataTypes.createStructField("Gender", DataTypes.StringType, true), + DataTypes.createStructField("Country", DataTypes.StringType, true), + DataTypes.createStructField("state", DataTypes.StringType, true) }); + + List rows = Arrays.asList(new Row[] { + RowFactory.create(37, "Female", "United States", "IL"), + RowFactory.create(44, "M", "United States", "In"), + RowFactory.create(32, "Male", "Canada", "NA") }); + + Dataset input = spark.createDataFrame(rows, schema); + Dataset output = Main.countByCountry(input); + Map countMap = output.collectAsList().stream() + .collect(Collectors.toMap((a) -> a.getString(0), (a) -> a.getLong(1))); + + assertEquals(1, countMap.get("United States")); + assertEquals(1, countMap.get("Canada")); + } + ``` + +### Using Spark Local Cluster Manager + +- launch.json - equivalent of run configurations in intellij + ```json + { + "version": "0.2.0", + "configurations": [ + { + "type": "java", + "name": "Main [Local]", + "request": "launch", + "mainClass": "com.example.spark_batch.Main", + "projectName": "spark-batch", + "args": ["local[*]", "src/main/resources/sample.csv"] + } + ] + } + ``` + +### Using Spark Submit + Hadoop + +- setup hadoop in pseudo distributed mode using [this link](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html) +- namenode format - `~/hadoop-3.3.6/bin/hdfs namenode -format` +- start all components using `~/hadoop-3.3.6/sbin/start-all.sh` +- create initial hdfs directories - `~/hadoop-3.3.6/bin/hdfs dfs -mkdir -p /user/$USER` +- copy the necessary files - + - `~/hadoop-3.3.6/bin/hdfs dfs -put src/main/resources/sample.csv` + - `~/hadoop-3.3.6/bin/hdfs dfs -ls` +- build the jar. note - i am able to work without generating fat jars / use shade plugin for now, but based on the use case, that might be necessary - + ```sh + mvn clean install + ``` +- submitting jobs to spark - note the arguments, we need to specify them explicitly since default value of master is local and default value of deploy-mode is client + ```shell + ~/spark-3.5.0-bin-hadoop3/bin/spark-submit \ + --verbose \ + --class com.example.spark_batch.Main \ + --master yarn \ + --deploy-mode cluster \ + target/spark-batch-1.0-SNAPSHOT.jar \ + sample.csv yarn + ``` +- other important options which we could have sent spark-submit include + - executor-memory, driver-memory - ram + - executor-cores, driver-cores - cpu cores + - num-executors - number of executors + +### Using Spark Shell + +- using interactive clients - `./bin/spark-shell` +- on starting spark shell, we can access the ui at http://localhost:4040/jobs/ +- e.g. we can run the following commands - + ```scala + val df = spark.read.csv("full_path_to_sample.csv") + df.show() + ``` +- note how we did not have to create a spark session manually here, unlike when writing spark applications + +## Dataframe + Hadoop + +- **dataframe** - distributed table with a well defined schema i.e. each column has a specific data type +- working with dataframe is like working with tables in sql +- data stored in for e.g. 
hdfs is broken into smaller **splits**. these splits are of size 128mb by default +- the dataframe too will basically be composed of smaller chunks called **partitions**, where each partition might represent the hdfs split +- my understanding - the fact that spark does everything using memory instead of using files like map reduce is what makes spark more performant when compared to hadoop as well +- above, we talked about storage / data, now we talk about compute +- we submit the job to spark +- spark will then with the help of yarn's resource manager create the application master in one of the worker nodes. recall how containers are used hadoop 2.x onwards, so this application master would be created inside a yarn / hadoop container +- now, the spark driver will run inside this application master container +- then, the spark driver will talk to yarn's resource manager and create more worker containers +- then, spark will spawn executors inside these worker containers +- now, each executor will be responsible for some partition(s) of data, which it loads in its memory +- while doing all this, spark will take care of **rack awareness** i.e. assigning executors the partitions in such a way that there is minimum network transfer between hdfs and executor / container + +## Spark CPU and Memory + +- recall we said that we can dictate how many cores an executor can have when submitting the spark job +- e.g. we say executors should have 4 cores. nowadays, each core can itself be split into virtual cores as well +- important - "a task uses one slot", and "a slot = one virtual core" +- so, number of tasks that can run in a cluster = (number of slots in the executor) * (number of executors) +- now, assume that spark computes that it needs 30 tasks but we only have 20 slots in our cluster +- spark is intelligent enough to schedule the 20 tasks and queue the remaining 10 tasks +- till now, we talked about cpu, and now we talk about memory +- note for pyspark - + - the driver will have a python component running alongside it. so, if for e.g. using yarn, inside the application master container, there will be a python process and the actual jvm driver + - if we use functionality / libraries of python not available in pyspark, then the executors too will have a python component running alongside them. so, if for e.g. using yarn, inside the container, there will be a python process and the actual jvm executor +- when setting memory limits, we have two variations for both executor and driver - + - `spark.driver.memory`, `spark.driver.memoryOverhead` - my assumption - `spark.driver.memory` is same as passing `--driver-memory` to spark-submit + - `spark.executor.memory`, `spark.executor.memoryOverhead` - my assumption - `spark.executor.memory` is same as passing `--executor-memory` to spark-submit +- the memory variant is for the actual jvm driver / jvm executor, while the memory overhead variant is for non jvm processes (like the one needed when using pyspark) +- so, e.g. we set `spark.executor.memory` to 1gb and `spark.executor.memoryOverhead` to 0.1 + - spark driver would ask yarn for containers having memory 1.1gb. 
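- as a sketch, that sizing maps to spark-submit options like the following (the numbers simply mirror the 1gb + 100mb example above and are not a recommendation) -
  ```sh
  # the trailing app args are master and input file, in the order Main reads them
  ~/spark-3.5.0-bin-hadoop3/bin/spark-submit \
    --class com.example.spark_batch.Main \
    --master yarn \
    --deploy-mode cluster \
    --num-executors 4 \
    --executor-cores 4 \
    --executor-memory 1g \
    --driver-memory 1g \
    --conf spark.executor.memoryOverhead=100m \
    target/spark-batch-1.0-SNAPSHOT.jar yarn sample.csv
  ```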
note - this value should of course be lesser than a worker node's physical memory, otherwise we will get exceptions + - so, out of the 1.1gb for a container, 1gb would be allocated for executor jvm process, while the remaining 100mb would be allocated for non jvm processes, like the sidecar needed for pyspark +- there is more to how memory is broken down in spark, it is too much for my brain for now 😛 + +## Job Scheduling + +- there are two sides to this + - job scheduling across different applications - **dynamic resource allocation** + - job scheduling inside the same application - **spark schedulers** +- just covering from theoretical perspective, how to configure this can be found [here](https://spark.apache.org/docs/latest/job-scheduling.html) + +### Dynamic Resource Allocation + +- e.g. we have a spark job that uses up all the resources in our cluster +- now, we submit another small job +- but, this job cannot run since all the resources have already been used up +- a small job has to wait for the large job to complete +- so, spark has two strategies - static allocation and dynamic allocation +- **static allocation** - the default. the driver will ask for all the resources for its executors upfront. it will hold on to them for the entire duration till the entire job is over +- when we asked for some executors via the `num-executors` option, it meant that the spark driver would hold on to these resources for the entire duration of the job +- however, the number of executors the stages actually use can change dynamically +- remember - number of executors used in a stage depends on the number of tasks a stage has +- e.g. if a stage has 20 tasks, and we have executors with 5 slots (and sufficient memory), we will actually be using 20 / 5 = 4 executors +- but clearly, the number of executors actually needed by spark can change across stages +- so, we can instead use **dynamic resource allocation** - where instead of us manually specifying the number of executors to use, it is determined dynamically for every stage +- by default, static allocation is used, but we should consider using dynamic allocation if we are using a shared cluster for multiple jobs + +### Spark Schedulers + +- if our spark application has multiple jobs - + ```java + df1.join(df2).count() + df3.join(df4).count() + ``` +- by default, spark driver would execute this code synchronously. so, first all jobs for the first line would finish and then the all jobs for the second line would start and finish +- however, what if we use multithreading? - e.g. something like this - + ```java + Thread t1 = new Thread(() -> df1.join(df2).count()); + Thread t2 = new Thread(() -> df3.join(df4).count()); + t1.start(); t2.start(); + t1.join(); t2.join(); + ``` +- this means that the jobs for both would be triggered in parallel +- this is what we might actually want as well - what is the point of stalling the second job for the first job? +- however, when kicking off the jobs in parallel like this, they will content with each other for resources +- solution - by default, spark uses the fifo scheduler, but we can ask it to use the fair scheduler as well +- **fifo scheduler** - the first job gets priority. it gets all the resources for itself. then the second job gets the leftover, and would be stalled if not enough resources are available +- **fair scheduler** - assign resources to tasks in a round robin fashion. 
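- a sketch of opting into the fair scheduler through configuration (pools can additionally be defined via an allocation file, not shown here) -
  ```java
  SparkSession spark = SparkSession.builder()
      .master("local[*]")
      .appName("FairSchedulingDemo")
      .config("spark.scheduler.mode", "FAIR") // the default is FIFO
      .getOrCreate();
  ```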
all issues like **starvation** (short job waiting for a long running job) etc are prevented + +## Transformations and Actions + +- spark dataframes are **immutable** +- we tell spark driver the **transformations** we would like to do +- these transformations are simple sql statements - e.g. filter where age > 40, projection of columns, grouping, etc +- each transformation then results in a new dataframe +- transformations can be further categorized into **narrow transformation** and **wide transformation** +- narrow transformation - each partition of data can be processed independently. a transformation on one partition is independent of a transformation on another partition. e.g. filtering +- wide transformation - partitions need to be **repartitioned**. e.g. in group by, all rows belonging to the same group need to be brought into the same partition. this process of repartitioning of data for a wide transformation is called a **shuffle** +- **execution plan** - we write the transformations one by one using a builder pattern. but spark might not execute the operations in the same way - it will construct an execution plan, which is an optimized version of our transformations - e.g. if we filter then use project, it would move the projection before the filtering +- **lazy evaluation** - spark will not execute the transformations immediately - it will build the execution plan described above and wait for us to call an **action**. actions include `read`, `write`, `collect`, `show`. the moment we call an action, the execution plan is triggered, and we see a **job** +- `collect` will basically collect all the data in the driver. so, be mindful of out of memory exceptions when performing this operation + +## Jobs, Stages and Tasks + +- our entire spark application is broken down into **jobs** +- a job is triggered only once an **action** is encountered (recall lazy evaluation) +- jobs are further broken down into **stages** +- stages are further broken down into **tasks** +- so, tasks are the unit of work +- a task basically executes on one **slot** of executor and is responsible for a partition of data +- a task is a bunch of narrow transformations +- all the tasks of a single stage operate in **parallel** +- each wide transformation results in a new stage, due to the repartitioning that is needed +- before the tasks of a next stage start, all tasks of the previous stage should complete, because that was the entire point behind wide transformation - it depends on all the previous stage's partitions and not just one +- when going from one stage to another, since data is being **shuffled** / **repartitioned**, data is temporarily written to a buffer which spark calls **exchange** +- so, the idea probably is to wait for all tasks of a stage to complete and then with the help of exchange, get the right partition of data to the right executor and finally kick off the tasks of the new stage +- this process of copying data from the **write exchange** to the **read exchange** is called **shuffle / sort** + +![job stages tasks](/assets/img/spark/job-stages-tasks.drawio.png) + +## Debugging Spark + +- debugging spark is not easy - all the code we write is first converted into an execution plan and is lazily evaluated +- so, when we place debug pointers in our code, we are just stepping through the driver thread (which is not even doing anything). 
we are not stepping through the executor thread actually performing those transformations +- we can however use **lambda accepting transformations** like `map`, `flatMap`, `forEach`, etc +- when we place debug pointer inside these lambdas, we will be able to see the executor thread performing them +- logs are the best way of debugging a production spark application, which is running in a distributed environment +- first step is to log using the log4j2 libraries that come as a transient dependency from spark libraries, which i did in the code snippet shown earlier +- second step would be to provide the appropriate log configuration like - + - log4j2.properties file to use. this can be cluster wide or application specific, depends on use case + - configure the file and console appenders, specifying file names for the file appenders + - actually locating the log files for the driver vs the executors in the cluster manager + +## Spark Structured APIs + +- a bit of history - + - spark first came up with rdd, which was a better alternative to map reduce + - then spark came up with dataframe api, which was easier to work with + - however, we could not use the regular lambda transformations like `map`, `filter`, etc which rdd had + - so, the idea would be that we would convert between rdd to dataframe to use these, e.g. dataframe has `toJavaRDD()` + - however, on going from dataframe to rdd, we would lose out on the optimizer + - so, spark then came up with dataset api - + - we can use for e.g. java pojos which would give us compile time safety + - it supported the regular lambda transformations + - dataframe is now `Dataset`. row is a generic object, so it does not have the compile time safety unlike if we use pojos + - note - apparently, in java, spark does not have the concept of `DataFrame`, so we should instead use `Dataset` anyway +- rdd stands for **resilient distributed dataset** + - resilient because if for e.g. there is a failure in one of the executors, spark knows how to load the partitions the failed executor was responsible for into a new executor + - distributed because spark partitions the data into smaller chunks and processes them in parallel +- spark calls its dataset api as **structured apis** +- structured apis basically use rdd underneath +- spark asks us to use structured apis where possible, since there is a **catalyst optimizer** (also called **tungsten**) sitting in between structured apis and rdd, so we lose out on the optimizations when using rdd directly +- use rdd only for specific use cases like custom partitioning +- using dataframe (i.e. dataset of row) vs dataset (i.e. dataset of a specific pojo) - + - dataset will have compile time safety - using `.filter((person) -> person.age > 40)` has compile time safety unlike `.where(col("age").gt(40))` + - dataset is less optimal when compared to dataframe - serialization is an important step in distributed computing. dataset serialization will use java serializers, while dataframe serialization will be able to use tungsten underneath, which is more performant +- there is also something called spark sql - we can write a string which will exactly look like sql. this long string would be an alternative to chaining methods in spark dataframe api - + ```java + Dataset inputDf = // read.... 
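    // createOrReplaceTempView (next line) registers the dataframe under a session-scoped name,
    // which is what lets the sql string below refer to it as the table "survey"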
+ inputDf.createOrReplaceTempView("survey"); + Dataset countDf = spark.sql("select Country, count(*) from survey " + + "where age > 40 " + + "group by Country"); + ``` +- this sql works just like dataframe, so there is no performance impact there +- based on everything above, i will probably use dataframe all the way. i would also use the java apis and not sql, since it has a bit better compile time safety / auto complete as compared to writing sql strings +- **spark engine** - this sits on top of the chosen cluster manager. recall how unlike yarn is a part of hadoop, spark does not come with a cluster manager, and supports yarn, mesos, kubernetes. spark engine acts as an interface between spark and the chosen cluster manager + +![spark ecosystem](/assets/img/spark/spark-ecosystem.drawio.png) + +## Execution Plan / Catalyst Optimizer Working + +- the **catalyst optimizer** works internally in following steps +- or we can say that spark executes the **execution plan** in the following steps - +- generate an ast (abstract syntax tree). any errors in our field names, sql function usage, etc would be caught here +- now we will have a **logical plan** +- perform optimization on our logical plan. the optimization here includes techniques like - + - **predicate pushdown** - push filtering operations earlier to reduce the amount of data transfer + - **partition pruning** - when writing to internal sources, we can specify partitioning scheme, and then there will be a different directory for each partition. this has been discussed in [data sources](#data-sources). predicate pushdown can go up to as early as only reading some partitions of data when loading the data into dataframes + - **projection pruning** - push projection operations earlier to reduce the amount of data transfer +- generate a bunch of **physical plans**, and associate a cost with each of them. e.g. one plan uses shuffle join, another uses broadcast join +- finally, a **cost model** evaluates the most optimal physical plan +- **wholestage code generation** - generate the bytecode to run on each executor + +![execution plan](/assets/img/spark/execution-plan.jpg) + +## Data Sources + +- data **sources** in spark can be external or internal +- **external** - external to spark. some notable ones include + - jdbc data sources - oracle, ms sql, postgres, mysql + - no sql - cassandra, mongo db + - cloud data warehouses - snowflake, redshift + - streaming sources - kinesis, kafka +- **internal** - this can be either **hdfs** or **cloud based storage** e.g. s3 (preferred) +- for internal source, there are several **file formats** which we have to consider. again, spark supports various file formats like parquet, json, csv, avro, etc +- there are two ways to access external sources - + - ingest using external tools to write data from external sources to internal sources. data goes unmodified from different sources into internal sources. then spark reads from these internal sources directly. useful when using spark for batch processing + - make spark directly read from these different external sources +- batch processing prefers first option because for e.g. our db capacity was provisioned with otlp workloads in mind, and might not be optimized for spark based big data workloads. 
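- e.g. a common batch pattern - an ingestion tool lands the table as parquet on hdfs / s3, and the spark job just reads those files (the path below is an assumed example, not from this project) -
  ```java
  Dataset<Row> ordersDf = spark.read().parquet("hdfs:///user/etl/orders/");
  ```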
- this ingest-first approach thus decouples the two from a performance, security, etc. perspective +- stream processing prefers the second option +- so basically, while the recommended batch architecture is data from external sources -> some ingestion tools -> internal sources -> spark, we can however have spark read directly from external sources, as sketched below
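- a minimal sketch of that direct route - reading straight from a jdbc source (the url, table name and credentials are made-up placeholders, and the relevant jdbc driver would need to be on the classpath) -
  ```java
  Dataset<Row> ordersJdbcDf = spark.read()
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/shop")
      .option("dbtable", "orders")
      .option("user", "spark")
      .option("password", "secret")
      .load();
  ```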
+ ![spark architecture](/assets/img/spark/spark-architecture.drawio.png) +- finally, sinks work in the same way in spark - they can be internal or external +- we use `DataFrameReader` to read from internal / external sources, which we obtain via `spark.read()` +- we specify the type using `format` +- we provide configuration using `option` +- we can also provide a mode, which determines the behavior when spark encounters a **malformed record**. it can be - + - **permissive** (default) - make all columns null and place the record in a new column + - **drop malformed** - ignore the malformed records + - **fail fast** - terminate the program +- schema - + - for file formats like csv, either it can be defined explicitly using `schema` (preferred), or it can infer the schema automatically (prone to errors) + - for file formats like avro / parquet, the schema is a part of the file format itself and therefore spark derives its schema from the file format itself +- so basically, while we can use `schema` for defining the schema explicitly, remember this is applicable only for formats like csv and json, so best case would be to avoid these file formats altogether and try and use parquet / avro formats where possible +- spark has its own data types, and they map to different types specific to the language we use, e.g. we can see how spark types map to java types [here](https://spark.apache.org/docs/3.5.0/sql-ref-datatypes.html#supported-data-types)
+ ![spark to java types](/assets/img/spark/spark-to-java-types.png) +- the last boolean flag specifies whether the field is nullable or not + ```java + StructType schema = new StructType(new StructField[] { + DataTypes.createStructField("FL_DATE", DataTypes.DateType, true), + DataTypes.createStructField("OP_CARRIER", DataTypes.StringType, true), + DataTypes.createStructField("ORIGIN", DataTypes.StringType, true), + DataTypes.createStructField("DEST", DataTypes.StringType, true), + DataTypes.createStructField("CANCELLED", DataTypes.IntegerType, true), + DataTypes.createStructField("DISTANCE", DataTypes.IntegerType, true) }); + + Dataset flightDf = spark.read() + .format("csv") + .option("header", "true") + .option("path", "src/main/resources/flight*.csv") + .option("dateFormat", "M/d/y") + .option("mode", "FAILFAST") + .schema(schema) + .load(); + + flightDf.printSchema(); + flightDf.show(); + ``` +- note how we specified the date format in configuration as well - for column specific configuration (e.g. if two different columns have differently formatted dates), maybe we can use `to_date` to convert from string to date type +- writing data - the default format used by spark is parquet if not specified +- the mode can be - + - **append** - append to the existing data + - **overwrite** + - **error if exists** + - **ignore** - write if location is empty, ignore otherwise +- so, when i use the code below - + ```java + flightDf.write() + .format("avro") + .mode("overwrite") + .option("path", "src/main/resources/output/sinks_demo") + .save(); + ``` +- i get the following output -
+ ![simple output](/assets/img/spark/simple-output.png) +- note - `df.write()` has return type of `DataFrameWriter` (recall `spark.read()` had return type of `DataFrameReader`) +- note - for spark to avro, i had to add following dependencies, since avro related dependencies are bundled separately from spark + ```xml + + org.apache.spark + spark-avro_${scala.version} + ${spark.version} + + + + + 2.15.3 + + com.fasterxml.jackson.core + jackson-databind + ${jackson-databind.version} + + ``` +- **partition by** - the code is as follows - `df.write().partitionBy("OP_CARRIER", "ORIGIN")` +- notice how it is chained to `DataFrameWriter` and not `Dataset` unlike [`repartition`](#repartition-and-coalesce) - so, my understanding - partition by is when we want to write the output and make it optimized for future jobs that might read this output. however, repartition can help with optimizing the current job itself +- the columns we partition on are not visible in the output files, because they are essentially part of the directory names! +- note how directories for origin are nested inside directories for carrier
+ ![partition by output](/assets/img/spark/partition-by-output.png) +- we can also chain `maxRecordsPerFile` to the `DataFrameWriter`, just like we chained `partitionBy`. it is useful when there are some partitions that become too big for spark to process. e.g. in the above example, if for carrier nw and origin den, the number of flights were too many, by using this option, this directory too will contain multiple files +- why use **bucketing** - since partitioning results in a unique directory for each value, partitioning by a column having too many unique values might not be a good idea, since it would result in too many directories (partitions) with too less data. so, we can instead use **bucketing** for columns having too many unique values +- how it works - we specify the number of buckets and the column to bucket using. then, spark will do hash(column_value) % number_of_buckets to get the bucket in which the row should be stored +- **sorting** - sorting can further improve the performance - e.g. if we had to perform joins, and the data is already sorted on the columns used for join, we can skip the sort phase in the shuffle join (described later) +- so, just like `partitionBy`, i chained the following to `DataFrameWriter` - + ```java + df.write() + .bucketBy(2, "OP_CARRIER", "ORIGIN") + .sortBy("OP_CARRIER", "ORIGIN") + .mode("overwrite") + .option("path", "src/main/resources/output/sinks_demo/") + .save(); + ``` +- however, i got the following exception on running the above - `'save' does not support bucketBy and sortBy right now` + +### Spark + Hive + +- so, we have to use `saveAsTable`. my understanding - till now, we were simply storing data as normal files, and they were accessible like a regular directory structure, but for bucketing and sorting, we need to bring in "database support" of spark +- my understanding - whatever we discuss in this part has been borrowed from hive +- since spark too has concepts of database and tables, there are two things spark needs to store - + - the **actual data** - this is what we have seen till now, when for e.g. we saw files being stored inside folders (i.e. partitions) + - the **metadata** - the table name, database name, etc. this is stored in something called **metastore**. by default, an in memory implementation is used i.e. the duration of this metastore is the same as the spark session +- there are two kinds of tables in spark - + - **managed tables** - spark will manage both the metadata and the actual data. by default, the actual data is stored inside `spark.sql.warehouse.dir`. when we for e.g. drop a table, both the metadata and the actual data get deleted + - **unmanaged tables** - also called **external tables**. spark will only manage the metadata. when creating a table, we specify the location of the actual data. useful when for e.g. the actual data already exists somewhere and is not managed by us. when we for e.g. drop a table, only the metadata is deleted, and the actual data is untouched +- managed tables are preferred - we can do optimizations like bucketing and sorting. with unmanaged tables, we have to rely on the existing data structure. we need unmanaged tables when we need to perform spark operations on already existing data +- my thought - one technique might be to port data from unmanaged to managed tables for better performance / more flexibility? 
this should ideally again be something spark can do - + - read from unmanaged tables + - perform some transformations like sorting and bucketing + - finally write to managed tables +- we need to add the spark + hive dependency + ```xml + + org.apache.spark + spark-hive_${scala.version} + ${spark.version} + + ``` +- then, chain the hive support in the spark session builder - + ```java + SparkSession spark = SparkSession.builder() + .master("local[*]") + .appName("TablesDemo") + .enableHiveSupport() + .getOrCreate(); + ``` +- now, we first create a database - (otherwise default would be used) + ```java + spark.sql("create database if not exists tables_demo"); + spark.catalog().setCurrentDatabase("tables_demo"); + spark.catalog().listDatabases().show(); + ``` +- output of list databases -
+ ![list databases](/assets/img/spark/list-databases.png) +- finally, we write a dataframe as follows. note - notice how we do not provide the path parameter, since it is managed table territory, therefore the `spark.sql.warehouse.dir` will be used, and we call `saveAsTable(db_name.table_name)` instead of `save()` + ```java + flightsDf.write() + .bucketBy(2, "OP_CARRIER", "ORIGIN") + .sortBy("OP_CARRIER", "ORIGIN") + .mode("overwrite") + .saveAsTable("tables_demo.bucketed_by"); + ``` +- output - notice how two new directories - for metadata (metastore_db) and for storing the actual data (spark_warehouse) are created. data is stored inside `<>.db/<>`
+ ![bucket by](/assets/img/spark/bucket-by-output.png) + +## Transformations + +- for transformations, we can either use spark functions like we do in sql, or we can use lambda accepting transformations like `groupByKey` +- for specifying columns in transformations, either use column_name directly as a string, or use `df.col("column_name")`. note - we cannot use both methods in the same transformation +- **udf** or **user defined functions** - register custom functions to use inside spark - + ```java + UserDefinedFunction parse_gender = udf((String gender) -> { + Pattern malePattern = Pattern.compile("^m$|^male$|^m.n$", Pattern.CASE_INSENSITIVE); + Pattern femalePattern = Pattern.compile("^f$|^female$|^wom.n$", Pattern.CASE_INSENSITIVE); + return malePattern.matcher(gender).find() ? "male" + : (femalePattern.matcher(gender).find() ? "female" : "unknown"); + }, DataTypes.StringType); + spark.udf().register("parse_gender", parse_gender); + + Dataset gendersDf = surveyDf + .select("Gender") + .withColumn("gender_cleaned", expr("parse_gender(Gender)")); + + gendersDf.show(); + ``` +- a function for adding a unique identifier to each record - `monotonically_increasing_id`. this number would be unique across all partitions but remember that it would not necessarily be continuous +- usual sql constructs like renaming using `alias`, changing data type using `cast`, etc are available +- `explode` - e.g. our record contains an array field. this field will ensure our result contains a record for each element of the array. e.g. our input has 2 elements in the array for the first record, and 3 elements in the array for the second record. the output will have 5 records + +### Example + +- input - + + | day | month | year | + |-----|-------|------| + | 28 | 1 | 2002 | + | 23 | 5 | 81 | + | 12 | 12 | 6 | + | 7 | 8 | 63 | + | 23 | 5 | 81 | + +- transformation - + ```java + Dataset cleanedDobDf = dobDf.withColumn("year_parsed", + when(col("year").leq(23), col("year").plus(2000)) + .when(col("year").leq(99), col("year").plus(1900)) + .otherwise(col("year"))) + .withColumn("date", concat_ws("/", col("day"), col("month"), col("year_parsed"))) + .withColumn("parsed_date", to_date(col("date"), "d/M/yyyy")); + ``` +- output - + + | day | month | year | year_parsed | date | parsed_date | + |-----|-------|------|-------------|------------|-------------| + | 28 | 1 | 2002 | 2002 | 28/1/2002 | 2002-01-28 | + | 23 | 5 | 81 | 1981 | 23/5/1981 | 1981-05-23 | + | 12 | 12 | 6 | 2006 | 12/12/2006 | 2006-12-12 | + | 7 | 8 | 63 | 1963 | 7/8/1963 | 1963-08-07 | + | 23 | 5 | 81 | 1981 | 23/5/1981 | 1981-05-23 | + +## Aggregations + +- **simple aggregations** - note the different aggregations carefully, e.g. 
difference between `count("*")` vs `count("Description")` + ```java + Dataset aggDf = inputDf.select( + count("*").alias("total_count"), + count("Description").alias("non_null_description_count"), + countDistinct("InvoiceNo").alias("unique_invoices"), + sum("Quantity").alias("total_quantity"), + avg("UnitPrice").alias("avg_unit_price")); + ``` +- **grouping aggregations** - we can also perform groupings using `groupBy` + ```java + Dataset aggByCountryAndInvoiceDf = inputDf + .groupBy("Country", "InvoiceNo") + .agg(count("Quantity").alias("total_quantity"), + round(sum(col("UnitPrice").multiply(col("Quantity"))), 2).alias("invoice_value")); + aggByCountryAndInvoiceDf.show(); + ``` +- note - when we chained `groupBy`, it returns a `RelationalGroupedDataset`, and when we again chained `agg` to it, it was converted back to our usual `Dataset` +- **window aggregations** - e.g. we need the running total by week for every country. three things to keep in mind for windowing aggregations - + - identify the **partitioning columns** - e.g. here, restart the running total for every country + - identify the **ordering of columns** - e.g. here, ensure that the data is ordered by the week number, week 3's running total = week 1's sale + week 2's sale + week 3's sale, and this is only possible when we order by week + - identify the **window bounds** - e.g. here, it starts at the first record and ends at the current record, like described in the week 3 example above +- example - note - for the bounds, we also have something called `unboundedFollowing`, but for our use case, `unboundedPreceding` and `currentRow` was enough + ```java + WindowSpec windowSpec = Window.partitionBy("country") + .orderBy("week_number") + .rowsBetween(Window.unboundedPreceding(), Window.currentRow()); + + Dataset outputDf = inputDf + .withColumn("running_total", sum("invoice_value").over(windowSpec)); + outputDf.show(); + ``` +- output would be automatically sorted by country and week number based on the window we specified, and it would have the running total column added to it, which automatically resets for every country + +## Joins + +- bringing "left" and "right" dataframe together +- we combine them using the **join expression** and the **join type** + ```java + Dataset orderWithProductsDf = orderDf.join(productDf, + orderDf.col("prod_id").equalTo(productDf.col("prod_id")), + "inner"); + orderWithProductsDf.show(); + ``` +- order schema - (order_id, prod_id, unit_price, qty) +- product schema - (prod_id, prod_name, list_price, qty) +- therefore, the joined table's schema - (order_id, prod_id, unit_price, qty, prod_id, prod_name, list_price, qty) +- note how the joined table's schema contains two columns for quantity + - one is from the product - it probably indicates in stock + - one is from the order - it indicates quantity of product ordered +- assume we wanted to select only some columns (only order's quantity, not product's quantity) - + ```java + Dataset orderWithProductsDf = orderDf.join( ... ) + .select("order_id", "prod_name", "unit_price", "qty"); + ``` +- we get the following exception - `[AMBIGUOUS_REFERENCE] Reference 'qty' is ambiguous, could be: ['qty', 'qty'].` +- how it works - we pass column names, internally spark converts to it the right identifier. 
when we pass qty, it probably finds two identifiers and hence gets confused +- some solutions - + - rename columns before joining (`withColumnRenamed`) + ```java + productDf = productDf.withColumnRenamed("qty", "product_qty"); + ``` + - drop one of the ambiguous columns - + ```java + Dataset orderWithProductsDf = orderDf.join( ... ) + .drop(productDf.col("qty")) + .select("order_id", "prod_name", "unit_price", "qty"); + ``` + - specify explicitly which dataframe's quantity to use - notice the end of the select clause + ```java + Dataset orderWithProductsDf = orderDf.join( ... ) + .select(col("order_id"), col("prod_name"), col("unit_price"), orderDf.col("qty")); + ``` +- outer joins - e.g. when we use an outer join like left, we might receive nulls for some columns. to get rid of the nulls, we can use `coalesce`, which will set the final value to the first non null value from the list it receives. e.g. below, we do a left join to get all the orders. prod_name comes from the product dataframe. if one of the products are missing, prod_name would be null. so, we tell spark to use prod_id of order dataset if prod_name of product dataset is missing + ```java + Dataset orderWithProductsDf = orderDf.join(productDf, + orderDf.col("prod_id").equalTo(productDf.col("prod_id")), + "left") + .withColumn("prod_name", coalesce(col("prod_name"), orderDf.col("prod_id"))) + .select(col("order_id"), col("prod_name"), col("unit_price"), orderDf.col("qty")); + ``` +- one small trick for clean code - note how we write each of the chaining all in one place. we can for e.g. extract some parts to variables like join conditions, some complex transformations, etc +- another technique might be to extract parts of logic to functions that accept and return dataframes. this also helps unit test these bits of logic +- there are two kinds of joining techniques used by spark - **shuffle join** and **broadcast join** + +### Shuffle Joins + +- imagine when we have two datasets with 3 partitions, and we have three executors +- so first a **shuffle** + **sort** happens to ensure that the same keys from both datasets belong to the same executor +- now, we can simply perform a **merge** to join the two datasets +- refer diagram [here](#jobs-stages-and-tasks) to recall breakdown of exchange and shuffle + sort +- small note - i am defaulting to thinking of shuffle joins as **shuffle sort merge joins**. there is another variation - **shuffle hash joins**, which is less optimal when compared to shuffle sort merge joins, so i am ignoring shuffle hash joins for now + +![shuffle join working](/assets/img/spark/shuffle-join-working.drawio.png) + +### Optimizing Shuffle Joins + +- reduce data being joined - because this shuffle and sort process is of course the bottleneck and can cause out of memory like issues, we should consider techniques like filtering (basically reducing) the amount of data we are joining, performing aggregations before joining, etc. basically, code intelligently +- maximize parallelism - the maximum parallelism possible when performing a join is the minimum of the three parameters below, so try maximizing the three parameters below - + - maximum number of executors our cluster allows + - `spark.sql.shuffle.partitions` - this determines the number of partitions after a shuffle i.e. after a wide transformation happens. the default value of this is 200. so, in case of join, this will give the number of partitions of the joined dataset? 
+ - number of unique keys in the datasets involved in the join - handling **key skews** / **hot partitions** - discussed later +- **bucketed joins** - if the datasets are already bucketed and sorted using the keys involved in the join, we will not have to rely on the shuffling and sorting done by spark at all! the idea is to partition, sort and store the datasets in bucketed fashion before the join starts. we then load the datasets and perform the joins, and there would be no shuffle involved in the joins + +### Broadcast Joins + +- shuffle joins are used when we join two large datasets +- however, we can use broadcast joins when either (or maybe both) the datasets are small +- assume the smaller dataset can be stored inside one partition, while the larger dataset has 200 partitions +- if using the shuffle join technique, first all of the 200 + 1 partitions will be sent for shuffle and sort +- this means there is network transfer involved to first send the data to exchanges and then load it back in a sorted manner into the executors +- however, in the broadcast join technique, the partitions of the larger dataset can stay where they were, and the smaller dataset can be copied over to all the executors having the larger dataset partitions +- this way, we avoid having to move (shuffle) the larger dataset's partition over the network +- essentially, we are **broadcasting** the smaller dataset to all the executors having the larger dataset +- important - the driver and executor memory should be > than the size of the smaller dataset, so that it can fit inside the memory +- notice how below the larger dataset stays as it is unlike earlier where the larger dataset was sorted
+ ![broadcast join working](/assets/img/spark/broadcast-join-working.drawio.png) +- the threshold which decides when to use a broadcast join is `spark.sql.autoBroadcastJoinThreshold`, which is 10mb by default +- note - unlike shuffle join, broadcast join is a hash join always. recall in shuffle joins, we have the concept of both hash joins (rarely used) and sort merge joins +- we can also provide spark the hint to use broadcast join like so, if we are not happy with the defaults - + ```java + import static org.apache.spark.sql.functions.broadcast; + + Dataset joinedDf = flightsDf1.join(broadcast(flightsDf2), ... + ``` +- note - to confirm all this, go to http://localhost:4040/ -> Sql / Dataframe tab -> select the sql query +- we can tell from this if an exchange was involved or we skipped it by using bucketed joins, if shuffle join was used or broadcast join was used, etc + +## Spark AQE + +- **aqe** - **adaptive query execution** +- it includes optimizations discussed below +- set `spark.sql.adaptive.enabled` to true for this (it should be enabled by default in new versions), and rest of the optimizations discussed in this sections will be automatically enabled +- crazy granular configurations can be seen [here](https://spark.apache.org/docs/latest/sql-performance-tuning.html), use documentation for specific configuration, just learning things from theoretical perspective for now + +### Dynamically Deciding Shuffle Partitions + +- earlier, after a wide transformation, for e.g. group by, the number of output partitions from a stage would be = `spark.sql.shuffle.partitions` (default 200), but lets say we set it to 10 +- what if i only have e.g. 5 groups after a group by statement? +- spark would still create a total of 10 partitions, therefore 10 tasks in the subsequent stage +- now, our spark job would eat up the resources for 5 empty tasks as well +- remember that for a wide transformation, spark stalls all the tasks of its previous stage, so the empty tasks are just sitting idle +- this optimization by aqe resolves this issue +- spark will look at the number of unique groups, and then dynamically adjust the number of output partitions +- now, assume one of the partitions was relatively larger +- spark used one task for one partition of data +- spark would complete all the tasks except this one quickly +- again, remember that for a wide transformation, spark stalls all the tasks of its previous stage, so the tasks that get over quickly are just sitting idle +- this optimization by aqe resolves this issue as well +- spark would now also look at the number of records in each group +- spark can merge some partitions to be handled by one task +- so, since one task = one slot, that task would process multiple partitions of data one by one serially +- e.g. this way, our job ended up using only 4 slots optimally - this is better than for e.g. 
using 5 slots, out of which 4 would get over pretty quickly, since the 5th slot now can be allocated to some other job +- remember how this is different from [dynamic resource allocation](#dynamic-resource-allocation) - dynamic resource allocation changes the executors dynamically, while dynamically deciding shuffle partitions changes the number of output partitions and what partition goes to what executor dynamically +- so, recap - two optimizations - + - determine the number of shuffle partitions dynamically + - dynamically coalesce the smaller shuffle partitions + +![aqe shuffle partitions](/assets/img/spark/aqe-shuffle-partitions.drawio.png) + +### Dynamically Switching Join Strategies + +- we already know that broadcast joins are more optimal than the regular shuffle joins +- however, assume one of the tables have a lot of complex transformations before being involved in the join +- spark may not be able to decide whether or not to use broadcast join, and would default to using shuffle join +- however, with aqe enabled, spark can **after shuffling** decide to go for a broadcast join +- the optimization here is that while we are involved in the shuffle process (therefore the network transfer) of the shuffle join, we still get rid of the sort and merge process, which is more expensive than a simple broadcast join + +### Dynamically Optimizing Skew Joins + +- if for e.g. we are joining two tables, and we have a **hot / skewed partition** +- before aqe, number of partitions / tasks = number of unique keys involved in the joins +- after aqe, spark is intelligent enough to break the hot partition into smaller chunks +- now, these smaller chunks can be processed in parallel in different tasks +- thus, we will not have an overly sized task (and thus out of memory exceptions) anymore +- note how we had 3 tasks without aqe, but now have 4 tasks with aqe +- note how the partition of the smaller dataset is copied + +![aqe skew joins](/assets/img/spark/aqe-skew-joins.drawio.png) + +## Dynamic Partition Pruning + +- it is enabled by default +- first, recall [partition pruning](#execution-plan--catalyst-optimizer-working) +- it is usually used for efficiency gains in a star schema design, so thats the lingo used in this section +- e.g. we have sql like below - + ```sql + select * + from fact join dimension + on fact.dimension_id = dimension.id + where dimension.some_attribute = 'xyz' + ``` +- my understanding of constraints needed for dynamic partition pruning - + - should be a broadcast join (dimension table would be broadcasted) + - fact table should be partitioned using dimension_id + - and of course, dynamic partition pruning should be enabled +- now, how this join would work is - + - the dimension table would be filtered using some_attribute = 'xyz' + - the filtered dimension table would be broadcast everywhere + - spark would be intelligent enough to only load the partitions of the fact table where dimension_id is present in the ids of the filtered dimension table + +## Caching + +- two methods for caching - chain `cache()` or `persist()` on the dataframe +- `cache` will cache using the default storage and memory, and does not allow configuration +- `persist` allows for more configuration around storage + - use disk or memory or a combination of both + - when storing in disk, data would of course be serialized, but when storing in memory, we can either store it in deserialized format or serialized format. serialized format advantage - would be compact therefore acquire less space. 
serialized format disadvantage - it would need to be serialized before storing / deserialized after reading from memory, hence it would use cpu + - use replication +- the default for persist / cache is memory + disk, with deserialization for memory and no replication +- both cache and persist are lazy like transformations - they are only triggered once there is an action + ```java + Dataset cachedDf = df.cache(); + cachedDf.count(); + ``` +- spark need not cache all partitions, it would only cache the partitions based on the actions we use, e.g. if we use `take(10)`, it would just cache the first partition, since the first partition should be self sufficient in providing with 10 records. however, if for e.g. we used an action like `count()`, it would have to cache all partitions +- however, spark will always either cache the entire partition or nothing, it will never cache a portion of the partition +- when to cache - when we use the same dataframe in multiple actions +- to evict from cache, chain `unpersist()` on the dataframe + +## Repartition and Coalesce + +- **repartition** - the code is as follows - `partitionedDf = df.repartition(3)` +- when we try to write this repartitioned dataframe, the output looks like follows - + ![repartition output](/assets/img/spark/repartition-output.png) +- note - above, we saw `repartition(number)`, but we can also use `repartition(columns...)` or `repartition(number, columns...)` +- when we do not specify a number to repartition and just column names, the number of partitions created = `spark.sql.shuffle.partitions` +- so basically, the number of partitions in repartition = either specified by us in the function call, or set via `spark.sql.shuffle.partitions`, and the column used for this partitioning can be specified by us as well +- when to use repartition - to improve performance, but we should be absolutely sure, since repartition would cause a shuffle +- when we are reducing number of partitions, do not use repartition, use `coalesce` +- `coalesce` will only collapse the partitions on the same worker node, thus avoiding a shuffle sort +- so my guess - if for e.g. we call `coalesce(10)`, but the data was on 11 worker nodes, total number of partitions finally would be 11? + +## Hints + +- we can add hints related to partitioning and joins +- hints - no guarantee that they would be used +- in the dataframe api, we can either use spark sql functions, or use `dataframe.hint()` +- join hint example using both techniques - + ```java + df1.join(broadcast(df2)) + df1.join(df2.hint("broadcast")) + ``` +- partitioning hint example - `df.hint("coalesce", 4)` +- note - i don't think there is any difference between chaining coalesce directly vs using it as a hint +- when writing the same using sql, there is a special [comment syntax](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-hints.html#examples) we use + +## Shared Variables + +- these were both primarily used in rdd apis, but can have a niche use case in dataframe world as well + +### Broadcast Variables + +- **broadcast variables** use case - e.g. our udf uses some static reference data +- the reference data is for e.g. 5-10 mb, i.e. too big to store in plain code +- so, we can for e.g. store it in a file, and broadcast it to all the nodes +- this way, this variable can then be used inside the udf +- my understanding - maybe we can use closure as well i.e. we store the data in like a variable outside the udf, and then access it in the udf +- disadvantage of using closure - if for e.g. 
we have 1000 tasks running on 30 nodes, there would be 1000 deserializations. in case of broadcast variables however, there would only be 30 deserializations +- example - + ```java + SparkSession spark = // ... + Broadcast broadcastVar = spark.sparkContext().broadcast(new int[] {1, 2, 3}); + broadcastVar.value(); // can be used inside a udf. it returns [1, 2, 3] + ``` +- note - a better technique could also have been to somehow load this reference data as a dataframe if possible + +### Accumulators + +- **accumulators** are like a global variable that we can update +- e.g. from our udf, we would like to update a variable based on some condition +- so, these variables can be updated on a per row basis +- these variables basically live in the driver, and the executors internally communicate with the driver to update this variable +- example - + ```java + SparkSession spark = // ... + LongAccumulator accum = spark.sparkContext.longAccumulator(); + numberDf.foreach((x) -> accum.add(1)); + accum.value(); // should print the count of rows + ``` +- note - there is no shuffle etc involved in this process of realizing the final value of the accumulator - it is being mutated inside the driver by the executor communicating the changes to the driver +- so, these accumulators can either be updated from transformations like udf, or actions like forEach like we saw +- however, understand - if we use accumulators from within for e.g. udf, the value of accumulator can go bad - e.g. if a task fails, the executor will retry it - the accumulator cannot discard the partial changes made to it via the failed task, since there are too many concurrent modifications happening on it already via other tasks +- however, this does not happen when using an accumulator from inside actions like `forEach` + +## Spark Speculation + +- can be enabled via `spark.speculation`, false by default +- example we have 10 tasks, and all of them complete under 2 seconds, but one of them takes 10 seconds +- spark will automatically identify the slow running tasks and run a duplicate copy of this task +- this way, whichever one of the two finishes faster is used by spark, and the other task is killed +- useful when for e.g. the original task was running slow due to a fault in the worker node that it was running on, which was causing it to be slow +- running speculative tasks does have overhead in terms of resources +- e.g. if there are data skews or out of memory issues in our application, spark would still run copies of this task (which too will run slow or maybe fail) without realizing that the root cause is actually the data / faulty configuration itself + +## Streaming Introduction + +- earlier convention was batch processing - data first comes and sits in the lake +- then, there would be jobs that can be run for e.g. daily to perform the processing +- however, with time, jobs started demanding for smaller and quicker batches +- the idea is not to schedule the jobs in smaller intervals +- instead, we start viewing data as a stream that is in motion and not at rest +- spark streaming is an extension of the dataframe apis +- spark uses **micro batches** for achieving stream processing +- spark automatically takes care of lot of challenges like start and end time of batches, intermediate state management, etc +- initially, spark used **dstreams** - built on top of rdd +- now, sparks offers **structured streaming apis** - built on top of dataframe apis i.e. 
supports sql +- additionally, **event time semantics** are supported by structured streaming apis as well, which were not available in the d stream apis +- word count example using netcat - notice how for reading data, `read()` changed to `readStream()`, but otherwise, everything else stays the same. `readStream()` returns a `DataStreamReader` (recall read used to return `DataFrameReader`) + ```java + SparkSession spark = SparkSession.builder() + .master("local[*]") + .appName("Streaming Demo") + .getOrCreate(); + + Dataset lines = spark.readStream() + .format("socket") + .option("host", "localhost") + .option("port", "9999") + .load(); + ``` +- data from the socket comes in a column `value`. we want to split each line into its constituent words, and create a separate row for each word + ```java + Dataset wordCount = lines.select(explode(split(col("value"), " ")).alias("word")) + .groupBy("word") + .count(); + ``` +- finally, we try writing it to the console. again, `write()` changes to `writeStream()`. writeStream returns a `DataStreamWriter` (recall write used to return a `DataFrameWriter`) + ```java + StreamingQuery streamingQuery = wordCount.writeStream() + .format("console") + .option("checkpointLocation", "checkpoint") + .outputMode("complete") + .start(); + streamingQuery.awaitTermination(); + ``` +- note - we used `streamingQuery.awaitTermination()` above to simulate running an application indefinitely, and we got streamingQuery from the result of writing to a streaming sink +- note - sinks terminate when application is stopped / due to some error condition +- however, what if were writing to multiple sinks? + - we can use `spark.streams().awaitAnyTermination()`, when any of the streaming sinks terminate + - remember to have multiple checkpoint locations - do not use the same checkpoint location for multiple streaming sinks +- start the netcat utility using `nc -lk 9999`, and run the app to see the streaming output in the console +- working - first, spark creates an optimized logical plan, just like it did in case of dataframes +- now, it would create a job that reads from the source, processes it and finally writes it to the sink +- underneath, spark runs a background thread +- based on our trigger configuration, a new spark job is created. so, a spark job will not be created at every interval, it would only be created based on our trigger configuration, and all this is taken care of us by a background thread + ![spark streaming jobs](/assets/img/spark/spark-streaming-jobs.png) +- **trigger** determines how often to trigger the micro batch +- the default is **unspecified**. trigger a micro batch immediately, but stall this current micro batch until there is some input in the source +- trigger can also be based on for e.g. **time interval** - if the previous micro batch exceeds the time limit, the new batch starts after the previous batch finishes. however, if the previous micro batch finishes before the specified time limit, the new batch would wait till the mark reaches the time. for this, just chain the below to the `writeStream()` + ```java + .trigger(Trigger.ProcessingTime("1 minute")) + ``` +- finally, trigger can also be **continuous** - this is an experimental feature, where the performance is even faster than the current micro batch approach +- some popular streaming sources / sinks - netcat (already seen above), file and kafka +- the file source is capable of monitoring the path for new files. it can also use archival i.e. 
move the processed files to a different directory / delete the processed files altogether +- so, the only sinks available are kafka, file and console for streaming requirements. how to for e.g. use jdbc? we can use `foreachBatch`, which is called once for every micro batch, with that micro batch's dataframe and its batch id - + ```java + outputDf.writeStream().foreachBatch((df, batchId) -> { + df.write() + .format("xyz") + // ... + .save(); + }).start(); + ``` +- output modes - + - **append** - like insert only. used when previous outputs are not affected + - **update** - like upsert i.e. either new records are added or old records are updated + - **complete** - overwrite the complete result every time +- update vs complete example - + - input -
+ ![streaming input](/assets/img/spark/streaming-input.png) + - complete -
+ ![streaming output complete](/assets/img/spark/streaming-output-complete.png) + - update - (look at batch 2 in particular)
+ ![streaming output update](/assets/img/spark/streaming-output-update.png) +- append does not make sense with aggregations like count, so it would throw an error like this - `Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;`. the why - this is because append means immutable - the other two output modes - complete and update have some way of reflecting updates made to previous groups, but append cannot allow for updating of existing groups, only creating of new groups. now maybe how aggregations work in spark streaming - spark receives a record, decides which group this record should belong to, and updates that group. this updating is not allowed in append mode, hence append mode does not support aggregations +- a spark streaming application is like a web server i.e. keeps running unlike when submitting batch jobs to spark +- even a streaming application will stop at least at some point due to reasons like some failure, some maintenance, etc +- so, we need to be able to handle this stopping and restarting gracefully +- **gracefully** = **exactly once processing** +- exactly once processing basically means neither should we end up reading an input twice, nor missing an input record +- this is what **checkpoint location** helps achieve +- checkpoint location maintains things like - + - what were the input boundaries of the last micro batch + - state information (e.g. running total of the word count) +- we just saw how checkpoints help spark achieve exactly once processing. however, exactly once processing also depends on sources and sinks - e.g. sources should be replayable i.e. allow reading of old messages. e.g. using kafka / files as streaming sources allows for this. similarly, sinks should be idempotent i.e. they should recognize duplicates instead of adding duplicates to the data +- what if our application has a bug? - we fix the spark code, we rerun spark-submit. now, can we rely on checkpointing to continue the job from where it left off after the job was stopped and restarted? + - yes, if our fix was something like filter out malformed records + - no, if our fix changed the aggregation strategy etc, since maybe it messes up the checkpoint state altogether + +## Streaming Using Kafka + +- add the following dependency - + ```xml + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId> + <version>${spark.version}</version> + </dependency> + ``` +- use the following to establish a connection - + ```java + Dataset kafkaSourceDf = spark.readStream() + .format("kafka") + .option("kafka.bootstrap.servers", "localhost:9092") + .option("subscribe", "invoices") + .load(); + ``` +- when we try printing the schema - `kafkaSourceDf.printSchema();`, we get the following - + ``` + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- topic: string (nullable = true) + |-- partition: integer (nullable = true) + |-- offset: long (nullable = true) + |-- timestamp: timestamp (nullable = true) + |-- timestampType: integer (nullable = true) + ``` +- the value is in binary format.
here is how to extract all fields into dataframe friendly format + - assume we create the schema of the payload somewhere + - then, we can cast the value field to a string + - then, call from_json on it, which also needs the schema + - this means all our data would be available as a struct type under the attribute value + - finally, based on [this](https://stackoverflow.com/a/54433013/11885333), i chained a `.select`, so that i do not have to access fields using value.attribute, but just using attribute - + + ```java + Dataset flattenedDf = kafkaSourceDf + .select(from_json(col("value").cast("string"), schema).alias("value")) + .select("value.*") + ``` +- [this doc](https://kafka.apache.org/quickstart) is great for debugging when writing kafka related code - creating topics, publishing to topics using kafka-producer, consuming from kafka-consumer, etc +- now, when we try `flattenedDf.printSchema();`, we get the right schema which we can use in our transformations +- to understand - how does kafka + spark actually work i.e. does spark rely on offset committing logic of kafka, or does spark itself maintain the offset inside the checkpoint directory +- writing to kafka - while reading from kafka, we deserialized the value attribute. while writing to kafka, we need to convert our dataframe into two fields of key and value + - combine all fields into a struct + - convert this field to json + - rename this condensed field to value + - pick any other attribute to act as key + + ```java + .select( + to_json(struct("*")).alias("value"), + col("InvoiceNumber").alias("key")); + ``` + +## Streaming Transformations + +- **stateless transformations** - do not need to maintain state across micro batches. e.g. filter, map, flatMap, explode, etc +- **stateful transformations** - need to maintain state across micro batches. e.g. for computing totals etc as we process new records, the state needs to be stored as a part of the checkpoint. e.g. grouping, aggregations +- now, stateless transformations do not support complete output mode. think why - + - if our streaming transformations are only stateless, 10 input records would contain 10 output records + - this means we will have to include input records as a part of the output every time + - this means all records need to be stored in the state, which is not efficient for spark +- so, as a side effect - we can run into out of memory issues when using spark streaming due to excessive state. spark stores all this state inside memory for efficiency +- it also stores it in the checkpoint location so that for e.g. when the application dies / is stopped due to some reason, it can resume from where it left off +- so, we have two concepts - **time bound state** and **unbounded state** +- **time bound state** - e.g. we calculate a weekly running total. spark knows that it can get rid of records older than a week,since they do not contribute to the total. this is also called **managed state**, since spark can manage this state +- **unbounded state** - there is no time bounds we can specify for the state. therefore, we ourselves need to specify some kind of cleanup logic for the state, so that our application does not encounter out of memory issues. 
this is also called **unmanaged state**, since the cleanup logic is on us to implement + +## Window Aggregations + +- this is the time bound state / managed state that we talked about above +- **trigger time** - determines when a micro batch starts and ends +- **event time** - the actual time when the event occurred +- important - the bounds of the **window** we specify has nothing to do with the trigger time +- the window we specify uses the event time to decide which window the record should be a part of +- spark also handles **late events** - e.g. we get an event for 10.00-10.15 when we have already performed processing for 10.15-10.30 and 10.30-10.45 +- e.g. we create a window of 15 minutes - + - this basically means a new column called window of type struct would be added to our dataset, with two fields - start and end + - spark will automatically decide for us which of these groups a record belongs to, based on the column name we specify. this column acts as the event time - e.g. created time in this example + - since this is basically inside a group, we can specify more columns to group on. e.g. we specify type column in the group by clause. then, we get windows for each of the type separately + - finally, we perform an aggregation - all records where type is buy, have their amount attribute added to total buy, all records where type is sell, have their amount added to total sell + - so basically think about whats in state of spark - for all groups i.e. windows, spark is storing the computed aggregate and updating it as and when new records arrive + - confusion, note - remember how this window is so much more different than the windowing aggregation we saw earlier - there, there was no grouping or aggregation involved - based on our specification, we were automatically able to add a new column for running total + + ```java + Dataset outputDf = stockSourceDf + .groupBy(window(col("CreatedTime"), "15 minute")) + .agg( + sum(when(col("Type").equalTo("BUY"), col("Amount")).otherwise(lit("0"))).alias("TotalBuy"), + sum(when(col("Type").equalTo("SELL"), col("Amount")).otherwise(lit("0"))).alias("TotalSell")); + ``` + +- remember - spark had to maintain old windows inside its state as well, to help it with late events +- **watermark** - helps expire old window state, so that out of memory etc exceptions are not caused. remember how this is the biggest advantage of using managed state +- so, we need to decide how late can an event be, post which - + - we can simply ignore the event + - we can clean up the state for that window +- for this, we simply need to chain the `withWatermark`. note - + - chain it before the group by clause + - column name used for windowing and column name specified inside watermark should be the same + + ```java + .withWatermark("CreatedTime", "30 minutes") + .groupBy(window(col("CreatedTime"), "15 minute")) + ``` +- how should the cleanup happen? - all windows with end_time < (max_event_time - watermark) can be ejected from state (note - max_event_time i think means event with maximum time in the micro batch). e.g. say our watermark is 30 minutes, and we receive a record with event time = 10.48. all windows with end time before 10.48 - 30 = 10.18 would be ejected from the spark state. 
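+- putting the above pieces together - a minimal sketch (reusing the hypothetical `stockSourceDf` streaming dataframe and the `CreatedTime` / `Amount` columns from the example above) of a watermarked window aggregation written out in update mode -
+  ```java
+  import static org.apache.spark.sql.functions.*;
+
+  // watermark first, then window on the same event time column
+  Dataset windowedDf = stockSourceDf
+      .withWatermark("CreatedTime", "30 minutes")
+      .groupBy(window(col("CreatedTime"), "15 minute"))
+      .agg(sum("Amount").alias("TotalAmount"));
+
+  // update mode - only windows touched by the micro batch are emitted,
+  // and windows older than the watermark can be dropped from state
+  StreamingQuery query = windowedDf.writeStream()
+      .format("console")
+      .outputMode("update")
+      .option("checkpointLocation", "checkpoint-windowed")
+      .start();
+  query.awaitTermination();
+  ```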
this is the managed state / automatic cleanup that we were talking about in time bound state +- watermark and complete output mode do not make sense together - spark cannot cleanup state if it has to output all the records for every micro batch +- recall how we had talked about append mode not working when we have group by etc in our streaming jobs, because append cannot update groups. however, think about watermarks - when the max_event_time - watermark moves, all windows with ends below this line can be closed. hence, when we introduce watermarks and windows with aggregations, spark supports append mode. all windows which have been declared closed by spark are output after the micro batch gets over +- summary of the difference between output modes when using watermark + windowing - + - complete - output all windows, ignore watermark concept + - update - output all windows which were updated by the micro batch, eject all windows from state which are declared stale by spark via watermark concept + - append - eject all windows from state and only output windows which have been declared stale by spark via watermark concept, do not output all windows that were updated like update output mode +- **tumbling windows** vs **sliding windows** - + - tumbling windows do not overlap, while sliding windows can have an overlap + - my understanding - in tumbling windows, window duration = sliding interval, whereas in sliding windows, both are unequal + - in tumbling windows, an event can be a part of only one window. in sliding windows, an event can be a part of multiple windows, e.g. 10.18 can be a part of 10.10-10.20 and 10.15-10.25 + - so, the only difference in syntax is we now pass two parameters - window duration and sliding window size + + ```java + .groupBy(window(col("CreatedTime"), "15 minute", "5 minute")) + ``` + +## Streaming Joins + +### Streaming to Static + +- commonly used for stream enrichment +- stateless - spark does not have to maintain any state - this is because every time we get an event, we can simply compute the rows it produces as a result of the join and output these results, since they would not change / the event would not be needed for computing future joins anymore +- for each micro batch, spark is smart enough to refresh the static dataframe i.e. imagine when the application is already running, we insert new data into the static dataframe underlying source, e.g. jdbc. spark will reload the static dataframe with the new data when a new event comes in for the streaming dataframe +- inner join is supported +- left outer join is possible when the streaming dataframe is on the left. why - assume right outer join was allowed. spark would have to predict for the static dataframe's record whether or not a row is present in the streaming dataframe. this cannot be concluded, since streams grow infinitely. this is why right (and full) outer joins are not supported + +### Streaming to Streaming + +- stateful - we need to maintain both sides of data forever in the state, unlike when joining streaming dataframe to static dataframe. remember how this is stateful, but streaming to static can be stateless +- we can solve this problem using 🥁 `withWatermark`. 
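+- a minimal sketch, assuming two hypothetical streaming dataframes `impressionsDf` and `clicksDf` with event time columns `ImpressionTime` / `ClickTime` - both sides get a watermark, and the join condition also constrains the event times so that spark knows when buffered state can be dropped -
+  ```java
+  import static org.apache.spark.sql.functions.expr;
+
+  // watermark each side on its own event time column
+  Dataset impressions = impressionsDf.withWatermark("ImpressionTime", "30 minutes");
+  Dataset clicks = clicksDf.withWatermark("ClickTime", "30 minutes");
+
+  // equi join on the (hypothetical) ad id, plus a time range constraint
+  Dataset joinedDf = impressions.join(
+      clicks,
+      expr("ClickAdId = ImpressionAdId AND ClickTime BETWEEN ImpressionTime AND ImpressionTime + interval 1 hour"));
+  ```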
specify a watermark on both streams being joined, so that spark can remove events that are stale +- inner join is supported +- left outer join is possible but with some limitations, TODO +- TODO: spark interview question of memory diff --git a/_posts/2024-01-18-elasticsearch.md b/_posts/2024-01-18-elasticsearch.md new file mode 100644 index 0000000..dd1dd96 --- /dev/null +++ b/_posts/2024-01-18-elasticsearch.md @@ -0,0 +1,1367 @@ +--- +title: Elasticsearch +--- + +## Introduction + +- elasticsearch is open source +- we interact with elasticsearch using rest api and json, making it easy to work with +- elasticsearch is written in java and uses apache lucene underneath +- row in rdbms - **documents** in elasticsearch +- columns in rdbms - **fields** in elasticsearch +- table in rdbms - **index** in elasticsearch +- **index templates** - apply settings and mappings to indices that match a pattern + +## Use Cases + +- used for implementing search functionality, by addressing common problems like + - filtering search results - e.g. filter products based on category, price range, brand, etc + - sort results based on relevance - e.g. most reviewed, similarity with search parameters, etc +- we can aggregate the data stored in elasticsearch while querying. so, using elasticsearch data for analytics and not at all for searching is a perfectly valid use case +- apm or application performance management - e.g. analyze logs, monitor system metrics, etc +- machine learning - + - forecast future values - e.g. predict sales + - anomaly detection - e.g. alert when number of visitors on our website suddenly drops + +## Elastic Stack + +- elasticsearch - the heart of the elastic stack which stores the data +- kibana - + - serves as a web interface for configuration etc + - visualize the data stored in elasticsearch by creating dashboards in kibana + - note - kibana stores its data in elasticsearch. this means a new kibana instance pointing to our existing elasticsearch instance will automatically load all the configuration, dashboards, etc +- logstash - traditionally for processing logs and sending to elasticsearch. now, it has evolved into a more general purpose data processing tool, to perform etl +- x pack - add additional features like - + - authentication and authorization to elasticsearch and kibana + - monitoring - monitor performance of components of elasticsearch, logstash, kibana, etc and set up alerting based on issues related to these components + - machine learning + - graph - e.g. suggest relevant songs. popular != relevant. e.g. if 10 users use google, it is just because google is a very commonly used search engine, but if 10 users use stack overflow, it indicates something common between them. it helps us look for "uncommonly common" features + - sql - we typically use elasticsearch's query dsl to query elasticsearch, but we can also use sql, which gets translated to the query dsl bts. this can help people used to sql to get started with using elasticsearch +- beats - light weight agents installed on servers which then ship data to elasticsearch / logstash. e.g. file beats for sending log files, metric beats for system level metrics like memory and cpu usage, etc + +## Setup + +- download elasticsearch from [here](https://www.elastic.co/downloads/elasticsearch) +- download kibana from [here](https://www.elastic.co/downloads/kibana) +- run `./bin/elasticsearch` to run elasticsearch. 
it will display the following - + - enrollment token - helps kibana communicate with elasticsearch securely + - password - `pU-z6IdUirqzzUsFVlWh` for me +- run `./bin/kibana` to run kibana. we need to do the following - + - it would display the kibana url with a code as query parameter. open it + - enter the enrollment token displayed in the elasticsearch console + - authenticate using username as `elastic` and password as what is displayed in the elasticsearch console +- to interact with elasticsearch + - in kibana using dev tools - + - `get _cluster/health` to view cluster's health + - `get _cat/nodes?v` to view all nodes + - `get _cat/indices?v` to view all indices + - using curl - + ```sh + curl --cacert elasticsearch-8.12.0/config/certs/http_ca.crt \ + -u elastic:pU-z6IdUirqzzUsFVlWh \ + https://localhost:9200/ + ``` + +## Architecture + +- **node** - an instance of elasticsearch +- each node belongs to a **cluster** +- we can have different clusters based on use cases, e.g. one cluster for search, a different cluster for apm, etc + +## Sharding + +- elasticsearch uses **sharding** to help it scale +- sharding - splitting an index into smaller chunks +- this way, we are not limited by the storage capacity of 1 node +- sharding is done at index level for flexibility, because some indices can be very large, while others very small +- because of sharding, we can scale the cluster horizontally instead of having to do it vertically +- underneath, each shard is independent, like a fully functionally index. actually, each shard is a lucene index underneath +- sharding also helps parallelize the elasticsearch queries we issue, since the query can be broken down and run on each shard in parallel +- so two advantages - scale storage and improve throughput +- for elasticsearch < 7.0.0 - + - default number of shards was 5 - thus leading to **over sharding** when there were many small indices in the cluster + - changing number of shards after creating an index was not possible - to increase the number of shards, people would create a new index with the correct number of shards and move over the documents manually +- for newer versions of elasticsearch - + - default number of shards is 1 + - we can increase / decrease the number of shards using the **split** / **shrink** api, and elasticsearch does the heavy lifting for us bts + +## Replication + +- the underlying nodes / hardware / storage in a cluster can easily fail +- introducing **replication** for fault tolerance in elasticsearch is very easy +- replication is also configured at index level +- copies of shards are created, called **replica shards** +- the shard that has been replicated is called the **primary shard** +- all of the shards together are called a **replication group** +- primary and replica shards are never stored on the same node, because that defeats the purpose +- so, if our cluster has only one node, no replica shards are added even if we set replication +- replicas can also serve as read replicas +- this means if we have three shards (one primary and two replicas), there can be three search requests that can be served in parallel +- so two advantages (just like sharding) - serve as standby and improve throughput +- default replication is 1 +- use `get _cat/shards?v` to view all shards. 
it gives us which index it belongs to, its type (primary or replica), which node it is stored on, etc + +## Snapshots + +- helps take backups +- we can take snapshots of specific indices or of the entire cluster +- it helps us restore the state to a specific point in time + +## Node Roles + +- **master** - + - the master node in a cluster performs cluster wide actions like creating and deleting indices + - if there are several nodes with this role, one of them are elected as the master + - larger clusters should have "dedicated masters" so that they do not perform high io tasks like serving search requests +- **data** - + - enables it to store shards + - thus, it can perform query / modification of data on the shards that it is responsible for +- **ml** - + - lets a node run machine learning jobs + - `xpack.ml.enabled` needs to be enabled as well on the node +- **coordination** - + - node can be responsible for distributing the queries and then aggregating the data results + - can be accomplished by disabling all other roles on the node, there is no direct role available in elasticsearch for this +- **voting only** - + - can participate in the election of a new master, but not be elected as the master itself + +## Simple CRUD + +- deleting an index - `delete pages` +- by default when we call `put index_name`, we get two shards by default - one primary and one replica shard +- this is why my cluster running locally goes into yellow health after creating an index - since i was running one elasticsearch node and one of the replicas shards are still unassigned +- specify settings when creating an index - + ``` + put products + { + "settings": { + "number_of_shards": 2, + "number_of_replicas": 0 + } + } + ``` +- we can index a document like below. it would return us the auto generated id for it + ``` + post products/_doc + { + "name": "Coffee Maker", + "price": 64, + "in_stock": 10 + } + ``` +- for a custom id, the endpoint above could have been like so - + ``` + post products/_doc/100 + ``` +- retrieving a product if we have the id - + ``` + get products/_doc/100 + ``` +- now, if we for e.g. run the below "for the same id" again, the older document is "replaced" with this new document + ``` + post products/_doc/100 + { + "name": "Red Shoes" + } + ``` +- note - we looked at two variations - `post <>/_doc` for automatic ids, and `post <>/_doc/<>` for custom ids and create or update (basically replace). there are many more variations, not bothering right now +- elasticsearch documents are immutable - when we call post using the same id again, elasticsearch will basically create a new document and re index this document, effectively replacing the older document +- **scripted updates** - update using code, instead of us first retrieving the value, deciding the new value and then updating it. this approach for e.g. reduces network calls made to elasticsearch. we can do things like set the operation to delete if the in stock value becomes 0 etc. 
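+- a minimal sketch of what such a scripted update might look like (hypothetical document id and field) - decrement the stock, and delete the document once it would hit zero -
+  ```
+  post products/_update/100
+  {
+    "script": {
+      "source": """
+        if (ctx._source.in_stock <= 1) {
+          ctx.op = 'delete';
+        } else {
+          ctx._source.in_stock--;
+        }
+      """
+    }
+  }
+  ```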
skipping this for now, as i would likely use an orm + +## Routing + +- **routing** - helps resolve the shard for a document +- basically - shard = hash(_routing) % number_of_primary_shards +- by default, _routing = id +- so, when we try performing crud operations using the id, this is how the shard resolution happens inside elasticsearch +- underneath, issues like skewed shards etc are prevented by elasticsearch automatically +- this is why changing the number of shards for an index on the fly is difficult - the shard of the existing documents might change as the number of shards change. for us as developers however, using the shrink and split api is much easier now in newer versions of elasticsearch + +## Working of Reads + +- the request reaches the coordinating node +- by the formula discussed in routing, it determines which primary shard is responsible for this document +- then, it directs the read request to the best replica shard in the replication of group of the primary shard +- the replica is chosen using a strategy called **ars** or **adaptive replica selection**, which is deemed best for performance +- finally, the response reaches the client back from the coordinating node + +## Working of Writes + +- the request reaches the coordinating node +- by the formula discussed in routing, it determines which primary shard is responsible for this document +- now, the request is sent to the primary shard, unlike in reading document where the request was sent to any replica using ars +- the primary shard validates the document - e.g. throw an error if a string value is being specified for a numeric value +- then it indexes the document +- now, it sends requests to its replica shards in parallel +- finally, the write is complete and the response is sent back to the client via the coordinating node + +## Conflicts During Writes + +- what if primary shard goes down after receiving a write? a replica shard would be promoted but what if the write was already committed to some other replica shards and not the newly appointed primary shard? +- what if a replica shard goes down during a write? +- many such failure scenarios can happen in distributed systems like this +- all these problems are handled by using primary term, sequence number and checkpoint in elasticsearch +- **primary term** - how many times the primary shard has changed +- **sequence number** - a counter that is incremented for each write operation. i think it is index specific +- **global checkpoint** - the minimum sequence number all the shards in the replication group have been aligned up to +- **local checkpoint** - the sequence number the current shard is at +- my understanding - the values of primary term and sequence number are also assigned to the documents to help with optimistic concurrency control +- **optimistic concurrency control** - what if an older version of document overwrites a newer version i.e. when writes happen concurrently? this situation is common, given the distributed nature of elasticsearch. e.g. two visitors on our e-commerce app try decreasing the in stock attribute by one simultaneously +- in newer versions, we are supposed to send the primary term and sequence numbers discussed earlier in order to implement optimistic concurrency control - + ``` + post products/_update/100?if_primary_term=1&if_seq_no=9 + // ... + ``` +- note, my understanding - apart from primary term and sequence numbers, when we retrieve documents, they also return a version. 
it is just like we would expect a version column to work, i.e. increments by one. it was used for implementing optimistic concurrency control in older versions, but the newer and preferred method is to use the primary term and sequence numbers instead that is descried above + +## Bulk Operations + +- these are much more efficient than sending out individual requests +- we can use the `_update_by_query` and `_delete_by_query` variants, where we specify the match clause +- the operations i think work like `update ... where ...` and `delete ... where ...` +- we can also use the bulk api to perform multiple kinds of operations on an index all at once +- this format that we use is also called nd json +- example of using bulk api inside kibana - + ``` + post products/_bulk + { "index": { "_id": 200 } } + { "name": "Espresso Machine", "price": 150, "in_stock": 4 } + { "create": { "_id": 201 } } + { "name": "Espresso Machine", "price": 150, "in_stock": 4 } + { "update": { "_id": 202 } } + { "doc": { "name": "Espresso Machine", "price": 150, "in_stock": 4 } } + { "delete": { "_id": 100 } } + ``` +- one line specifies the action (index, create, update, delete), the second line specifies the document contents (except in delete) +- my understanding - index vs create vs update - index works for both create and update, update fails when no document exists and create fails when document already exists +- we can also specify the primary term and sequence numbers inside the action line for optimistic concurrency control +- using curl to upload data using bulk api, where a file has all the data in the form of nd json - + ``` + curl -H "Content-Type: application/x-ndjson" \ + -XPOST \ + --cacert ~/elasticsearch-8.12.0/config/certs/http_ca.crt \ + -u elastic:pU-z6IdUirqzzUsFVlWh \ + https://localhost:9200/products/_bulk \ + --data-binary "@products-bulk.json" + ``` + +## Working of Bulk Operations + +- first, the query reaches the coordinating node as usual +- a snapshot of the entire index is taken +- then, the query to search for the documents and the bulk request to update them is sent to all the nodes +- if a failure occurs during this update, the failures are sent back to the client and "not rolled back" +- understand how this might be different from e.g. rdbms systems where there are features like transactions which help rollback +- idea is instead of rolling back, elasticsearch sends the failures to the client so that the client can handle it accordingly +- why was the snapshot taken - this helps elasticsearch implement optimistic concurrency control internally - it is not unlikely that since bulk request is huge, and during the processing of this bulk request, some document gets updated in the intermediary. so, elasticsearch uses this snapshot to compare the primary term and sequence number of the document it updates + +## Analysis + +- values are **analyzed** when indexing documents, to help with searching them +- different data types in elasticsearch will use different data structures e.g. numeric and geo spatial data might be stored inside bkd trees. however, most of them are fairly straightforward like in for e.g. rdbms, unlike text data types, what elasticsearch is known for. so, that is the focus here +- **analyzer** consists of three building blocks - character filters, a tokenizer and token filters +- **character filters** - + - add / remove / transform characters + - an analyzer can have multiple character filters, and they would be run one after another + - e.g. 
the html_strip character filter will filter out the html entities + - input - `I'm REALLY liking beer` + - output - I'm REALLY liking beer +- **tokenizer** - + - split into tokens + - an analyzer contains exactly one tokenizer + - some characters e.g. punctuations can be removed as a part of this + - important - the offset for each token is recorded as well. useful e.g. for [match phrase queries](#full-text-queries) + - e.g. the standard analyzer splits based on special characters + - input - "I'm REALLY liking beer" + - output - ["I'm", "REALLY", "liking", "beer"] +- **token filters** - + - add / remove / modify tokens + - an analyzer can have multiple token filters, and they would be run one after another + - e.g. lowercase filter to make all tokens lowercase + - input - ["I'm", "REALLY", "liking", "beer"] + - output - ["i'm", "really", "liking", "beer"] +- **standard analyzer** - the default. it uses no character filters, the standard tokenizer and finally the lowercase token filter +- there is an easy to use api, where we specify the analyzer / its components, and elasticsearch returns us the analyzed result + ``` + post _analyze + { + "text": "2 guys walk into a bar, but the third... DUCKS! :-)" + } + + post _analyze + { + "text": "2 guys walk into a bar, but the third... DUCKS! :-)", + "analyzer": "standard" + } + + post _analyze + { + "text": "2 guys walk into a bar, but the third... DUCKS! :-)", + "char_filter": [], + "tokenizer": "standard", + "filter": ["lowercase"] + } + ``` +- output is as follows - + ```json + { + "tokens": [ + { "token": "2", "start_offset": 0, "end_offset": 1, "type": "", "position": 0 }, + { "token": "guys", "start_offset": 2, "end_offset": 6, "type": "", "position": 1 }, + { "token": "walk", "start_offset": 7, "end_offset": 11, "type": "", "position": 2 }, + { "token": "into", "start_offset": 12, "end_offset": 16, "type": "", "position": 3 }, + { "token": "a", "start_offset": 19, "end_offset": 20, "type": "", "position": 4 }, + { "token": "bar", "start_offset": 21, "end_offset": 24, "type": "", "position": 5 }, + { "token": "but", "start_offset": 26, "end_offset": 29, "type": "", "position": 6 }, + { "token": "the", "start_offset": 30, "end_offset": 33, "type": "", "position": 7 }, + { "token": "third", "start_offset": 34, "end_offset": 39, "type": "", "position": 8 }, + { "token": "ducks", "start_offset": 43, "end_offset": 48, "type": "", "position": 9 } + ] + } + ``` +- we saw above how elasticsearch constructs tokens using a three step process +- **inverted index** - a mapping between the tokens and what documents contain these tokens +- e.g. finding which documents contain a specific term is as simple as looking up the term in the inverted index +- inverted indices are scoped to a "field of an index" +- **keyword** data type - used for exact matching. e.g. status field +- for full text searches, use the **text** data type instead +- internally, keyword uses the **keyword analyzer**, which is no op i.e. does not do anything +- in the inverted index that is created for keyword data type, the key is the entire string, the values are the documents having it +- in the inverted index that is created for text data type, the keys are the tokens, the values are the documents having it, along with offsets etc to help with for e.g. 
`match_phrase` query +- elasticsearch comes in with a lot of built in character filters, tokenizer and token filters, and we can mix and match them +- elasticsearch also comes in with a lot of [built in analyzers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html) which we can use. they are configurable as well, e.g. we can add the stop word to the standard analyzer +- two common token filters - + - **stemming** - reducing words to their root form. e.g. if the word in the description is "loved", and the client searches for "loves", they should still be able to search for the word. stemming helps reduce the word to its "root form" + - **stop words** - common words in a language that are filtered out when a field is analyzed. e.g. articles +- note - what we search for is analyzed in the same way as the attribute! e.g. if the word drinking in the document is stemmed to drink, the word drinks in the query is also stemmed to drink +- below is an example of creating a custom analyzer inside an index. notice the four sections inside analysis - character filter, tokenizer, filter (token filter is called filter) and finally analyzer - + ``` + put analyzer_test + { + "settings": { + "analysis": { + "char_filter": { }, + "tokenizer": { }, + "filter": { + "danish_stop": { + "type": "stop", + "stopwords": "_danish_" + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "standard", + "filter": [ + "lowercase", + "danish_stop", + "asciifolding" + ] + } + } + } + } + } + ``` + +## Mapping + +- **mapping** defines the structure of documents +- like a schema in rdbms +- two approaches - + - **explicit mapping** - we specify the fields and their data types ourselves + - **dynamic mapping** - the field mapping is automatically created for us when elasticsearch encounters a new field +- [data types](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html) available in elasticsearch +- creating an explicit mapping - + ``` + put reviews + { + "mappings": { + "properties": { + "rating": { "type": "float" }, + "content": { "type": "text" }, + "product_id": { "type": "integer" }, + "author": { + "properties": { + "first_name": { "type": "text" }, + "last_name": { "type": "text" }, + "email": { "type": "keyword" } + } + } + } + }, + "settings": { + "number_of_shards": 2, + "number_of_replicas": 0 + } + } + ``` +- retrieving the mapping for an index - + ``` + get reviews/_mapping + ``` +- when relying on dynamic mapping e.g. for strings, first, using [type coercion](#type-coercion), it would try converting it to a number / date. if that fails, the default behavior is to use [multi field mappings](#multi-field-mappings), so that text is used for attribute, and keyword is used for attribute.keyword. e.g. - + ``` + put recipes/_doc/1 + { + "ingredients": ["potato", "tomato"] + } + + get recipes/_mapping + ``` +- output - + ``` + { + "recipes": { + "mappings": { + "properties": { + "ingredients": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + } + } + } + } + } + ``` +- this default behavior might not be ideal for us since it consumes a lot of disk space, e.g. 
for ingredients, we would rarely perform text searches while for description of recipes, we would rarely perform aggregations, sorting, etc +- we can ask elasticsearch to disable dynamic mapping using - + - `"dynamic": "strict"` - would not allow unknown fields when indexing a document + - `"dynamic": "false"` - would allow additional fields but not analyze them. they would just be stored and be a part of the _source in the response + + ``` + put people + { + "mappings": { + "dynamic": "strict", + "properties": { + "first_name": { "type": "text" } + } + } + } + ``` + +### Changing the Mapping + +- changing the mapping might not be easy. e.g. assume we want to go from numeric to keyword data type. this is not easy for elasticsearch, since it would have to re index all the existing documents, since the underlying structure itself changes from a bkd tree to an inverted index (keyword data type uses keyword analyzer) +- so, we can use the re index api, which copies over our documents from the source to the destination index. while doing this, we specify the script, which can do some conversions for us. the syntax / working is similar to scripted updates which we mentioned earlier + ``` + post _reindex + { + "source": { "index": "reviews" }, + "dest": { "index": "reviews_new" }, + "script": { + "source": """ + if (ctx._source.product_id != null) { + ctx._source.product_id = ctx._source.product_id.toString(); + } + """ + } + } + ``` + +## Object vs Nested Data Types + +- when we use **object** data type, internally, elasticsearch flattens it using **dot notation** +- e.g. assume we had a document like below i.e. we set the field type of reviews to be an object + ```json + { + "product": { + "manufacturer": { + "name": "xyz" + }, + "reviews": [ + { "author": "abc", "rating": "4.7" }, + { "author": "def", "rating": "3" } + ] + } + } + ``` +- how elasticsearch kind of views them internally - + ```json + { + "product.manufacturer.name": "xyz", + "product.reviews.author": ["abc", "def"], + "product.reviews.rating": ["4.7", "3"] + } + ``` +- based on above, there is a downside of using the type object - if we search for a review left by abc and with rating 3, the current document we showed above would be returned - even though abc left 4.7. this is because after the flattening done by elasticsearch internally, the correlation between the fields of an object was lost +- therefore, due to the shortcomings above, we can use the **nested** data type. this means that all the fields of that structure would be correlated +- nested data type works in a fundamentally different way compared to object data type - internally, a new document is created for each of the review - so, if we were to index a document with 10 reviews, internally 11 documents would be indexed by elasticsearch. there is no flattening inside the same document like in object data type +- assume we had an array of objects. we can create mapping for nested type as follows - + ``` + // ... + "reviews": { + "type": "nested", + "properties": { + "author": { "type": "text" }, + "rating": { "type": "float" } + } + } + ``` +- for object data type, we just need to omit the `"type": "nested"` line +- having two many nested objects in the array can slow down queries etc, but this might be an indicator of a bad design in the first place as well. 
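+- for completeness, a sketch (reusing the hypothetical reviews / author / rating fields from the example above) of how the correlated search would look using the `nested` query -
+  ```
+  get products/_search
+  {
+    "query": {
+      "nested": {
+        "path": "reviews",
+        "query": {
+          "bool": {
+            "must": [
+              { "match": { "reviews.author": "abc" } },
+              { "term": { "reviews.rating": 3 } }
+            ]
+          }
+        }
+      }
+    }
+  }
+  ```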
there are limits on the maximum number of fields allowed inside a nested document, maximum number of nested objects allowed in the array, etc as a safeguard + +## Arrays in Elasticsearch + +- there is no concept of arrays in elasticsearch - any field can contain 0 or more values in elasticsearch by default + ``` + post products/_doc/100 + { + "tags": ["electronics"] + } + + post products/_doc/100 + { + "tags": "smart phone" + } + + get products/_doc/100 + ``` +- in case of text fields, values of array type are simply "concatenated" one after another + ``` + post _analyze + { + "text": ["hello", "world"] + } + ``` +- output - make note of the offset + ```json + { + "tokens": [ + { "token": "hello", "start_offset": 0, "end_offset": 5, "type": "", "position": 0 }, + { "token": "world", "start_offset": 6, "end_offset": 11, "type": "", "position": 1 } + ] + } + ``` +- empty array / skipping the field mean the same thing +- i don't think this is the same as explicitly providing null however + +## Date Data Type + +- internally, elasticsearch stores dates as milliseconds since epoch, by converting it into the utc timezone +- if we do not specify the format, we can specify it in iso-8601 format (the one that looks like `2024-01-21T04:25:21.139Z`) or a number, that is the milliseconds since epoch +- however, when creating the explicit mapping, we can also specify the format using the java date format + ``` + "purchased_at": { + "type": "date", + "format": "dd/M/yy" + } + ``` + +## Type Coercion + +- **type coercion** - if we provide `"price": "7.4"` instead of `"price": 7.4`, elasticsearch is smart enough to convert it to the float type instead of treating it as keyword, string, etc +- in the example below + - we first create a document (the index is created automatically), and ensure that the dynamic mapping has type number for price. if we started with string itself, then of course the dynamic mapping would create it using text + keyword type + - then, second and third calls go through due to type coercion and how [arrays](#arrays-in-elasticsearch) in elasticsearch work, while the fourth call fails because it cannot be coerced + + ``` + post coercion_test/_doc/100 + { + "price": 7.4 + } + + // the mapping shows price is of type float + get coercion_test/_mapping + + // does not throw an error + // even though we provide a string + post coercion_test/_doc/101 + { + "price": "7.4" + } + + // does not throw an error + // even though we provide an array containing strings and numbers + post coercion_test/_doc/101 + { + "price": ["7.4", 8.9] + } + + // will throw an error + post coercion_test/_doc/102 + { + "price": "7.4m" + } + ``` +- when retrieving the document, we see "7.4" and not 7.4! maybe while elasticsearch does analyze the fields, it will ultimately just return us what we provided it with in the first place +- notice how this is a recurring theme, we saw it in [mapping](#mapping) when using `"dynamic": false` as well - _source is the exact same as the input by user, but bts, other processes like coercion, analyzing based on data type, etc are carried out +- to avoid all the hassle with type coercion, we can just disable it as well when creating the index + ``` + put sales + { + "settings": { + "index.mapping.coerce": false + } + // ... + } + ``` + +## Multi Field Mappings + +- e.g. 
assume we want a field to have both type keyword and text +- problem statement + - aggregations etc can not be run on text data type, but can be run on keyword data type + - searching etc can not be run on keyword data type, but can be run on text data type +- e.g. we have a recipes index, and we would like to use ingredients for searching (use case for text data type) and for aggregations like popular ingredients (use case for keyword data type) +- so, elasticsearch allows us to specify multiple data types for a field +- e.g. below, text related data structures would be created for ingredients, while keyword related data structures would be created for ingredients.agg. so, when querying elasticsearch, we would use the same convention as well i.e. use ingredients when we want to use the text based queries but use ingredients.agg for keyword based queries + ``` + put recipes + { + "mappings": { + "properties": { + "ingredients": { + "type": "text", + "fields": { + "agg": { + "type": "keyword" + } + } + } + } + } + } + ``` +- recall how when relying on dynamic mapping, this is the default i.e. using attribute for text data type and attribute.keyword for keyword data type +- other use cases might be for e.g. to optimize a field using different analyzers for different use cases + +## Elastic Common Schema + +- **ecs** or **elastic common schema** +- how common fields should be mapped +- e.g. doesn't matter the source of event - redis, kafka, nginx, etc, the "event timestamp" should be mapped via `@timestamp` field +- it is more useful for standard events like from web servers, databases, etc, not for custom use cases like using a product index + +## Term Level Queries + +- **term level queries** - term level queries are not analyzed, it is not like a part of it should match, the entire thing should match +- it does not make sense to use term level queries with text data type. it is typically used for all other data types like keyword, numbers, etc. this is because term level queries are not analyzed, while text data type is analyzed. it just does not make sense to do so, even if we get some hits +- e.g. of term level query. recall how [dynamic mapping](#mapping) created [multi field mapping](#multi-field-mappings) for both text and keyword. so, since we want to use the keyword variant, we use the below (term level queries are not meant for text data types) - + ``` + get products/_search + { + "query": { + "term": { + "tags.keyword": "Vegetable" + } + } + } + ``` +- specifying multiple terms to match based on - + ``` + get products/_search + { + "query": { + "terms": { + "tags.keyword": ["Vegetable", "Meat"] + } + } + } + ``` +- we retrieved document by id using + ``` + get products/_doc/100 + ``` +- retrieve documents by multiple ids + ``` + get products/_search + { + "query": { + "ids": { + "values": ["100", "200"] + } + } + } + ``` +- **range searches** - useful for fields of type dates, numbers, etc + ``` + get products/_search + { + "query": { + "range": { + "in_stock": { + "gte": 1, + "lte": 6 + } + } + } + } + ``` + +### Flexibility in Term Level Queries + +- while term level queries are not analyzed, they do allow for some flexibility described in this section +- still, do not forget the rule of thumb - term level queries are not analyzed, and therefore are not meant to be used for text data types +- **case insensitive** - will match documents having a tag vegetable / Vegetable. 
notice how the structure changes a little bit, `tags.keyword` is not a string now like earlier, but an object, with the value specified under `value` + ``` + get products/_search + { + "query": { + "term": { + "tags.keyword": { + "value": "vegetable", + "case_insensitive": true + } + } + } + } + ``` +- **prefix** - begins with. will match documents having name both "Pasta" and "Pastry", but not "Red Pasta" + ``` + get products/_search + { + "query": { + "prefix": { + "name.keyword": { + "value": "Past" + } + } + } + } + ``` +- **wildcard** - can use `?` / `*`. `past?` will match "paste", `past*` will match "pasta" and "pastry" however, do not do `*past`. while it will work, it might be very slow if index is huge + ``` + get products/_search + { + "query": { + "wildcard": { + "name.keyword": { + "value": "Past*" + } + } + } + } + ``` +- **regexp** - allows for regular expressions, useful when use case is more complex than what wildcard can do. remember, i get slightly confused in other places as well - `past*` is wildcard, `past.*` is regular expression. just like in wildcards, only try using it for prefix matching + ``` + get products/_search + { + "query": { + "regexp": { + "tags.keyword": { + "value": "Bee(f|r)" + } + } + } + } + ``` +- below the value, for all types like regexp, wildcard, prefix, etc, we can additionally also specify `case_insensitive` + +### Exists Term Level Query + +- search for all documents where a tag exists + ``` + get products/_search + { + "query": { + "exists": { + "field": "tags.keyword" + } + } + } + ``` +- what basically happens in exists query - it looks for all documents that are present in the inverted index +- there can be many reasons why a document would not be present in an inverted index, some common ones are - + - we specify null + - we specify an [empty array](#arrays-in-elasticsearch) - recall this is the same as omitting the field + - if for e.g. we use the `ignore_above` parameter, and the value was too long and was thus not indexed - recall this is usually keyword not text, so the entire string would be used for the inverted index and not the tokens, in which case it might have stayed below the character limit + +## Full Text Queries + +- term level queries are used for exact searches on structured data +- **full text queries** are used for searching through unstructured data +- the query is analyzed - if the field is analyzed, the same analyzer is used, else the standard analyzer is used +- analyzing both the query and the actual query using the same analyzer is key - otherwise, finding the document in the inverted index would not be possible +- like term level queries should be used for any data type but text +- full text queries should be used for only text data types +- querying all documents - + ``` + get products/_search + { + "query": { + "match_all": {} + } + } + ``` +- e.g. search for a particular field - note how `case_insensitive` is not needed like in term level queries, since the standard analyzer already contains the lowercase filter + ``` + get products/_search + { + "query": { + "match": { + "name": "PAsTa" + } + } + } + ``` +- if we specify multiple words, e.g. below, we get all products having either pasta **or** chicken in their name + ``` + get products/_search + { + "query": { + "match": { + "name": "pasta chicken" + } + } + } + ``` +- this is because the default operator is or. we can however change it to and as below. 
notice how the structure changes a little bit, `name` is not a string now like earlier, but an object, with the value specified under `query` + ``` + get products/_search + { + "query": { + "match": { + "name": { + "query": "pasta chicken", + "operator": "and" + } + } + } + } + ``` +- **multi match** - match using multiple fields i.e. either name or tags should have vegetable + ``` + get products/_search + { + "query": { + "multi_match": { + "query": "vegetable", + "fields": ["name", "tags"] + } + } + } + ``` + +### Controlling Scores in Full Text Queries + +- **relevance scoring** - typically in term level queries, the score is just 1, so this concept is not present there. this is not true in full text queries though. e.g. in the or variant of pasta chicken example discussed above, the recipes having both pasta and chicken come before recipes having either of them. this is because recipes containing both are deemed more relevant by elasticsearch +- **relevance boost** - e.g. boost the score of recipes having vegetable in its name. notice how everything is almost the same except the caret symbol + ``` + get products/_search + { + "query": { + "multi_match": { + "query": "vegetable", + "fields": ["name^2", "tags"] + } + } + } + ``` +- by default, the score and therefore the sorting happens using the "best matching field". e.g. assume a recipe has vegetable both in its name and its tag. if the name above leads to a score of 12.3 and tags lead to a score of 3, "the final score is not 15.3, but 12.3". we can change this behavior by specifying for e.g. **tie breaker**. so, its like the default value of tie breaker is 0. if we specify for e.g. 0.3, the final score = field_with_highest_score + (0.3 * (sum_of_scores_of_other_fields)). so, all other fields will contribute 0.3 of its score, while the field with the highest score will contribute its entire value + ``` + get products/_search + { + "query": { + "multi_match": { + "query": "vegetable", + "fields": ["name^2", "tags"], + "tie_breaker": 0.3 + } + } + } + ``` + +### Full Text Queries - Match Phrase + +- **match phrase** - a phrase is a sequence of one or more words. till now, the examples we saw did not consider the order of the words, e.g. if we search for "chicken pasta", "pasta xyz chicken" and "chicken pasta" should have the same score. using match phrase, words should appear in the "correct order" and "one after another". e.g. if we search for "complete guide", "a complete and useful guide" would not match. this why [offsets](#analysis) was stored as a part of analysis in the first place. again since it is a full text query, the field would be analyzed using the same analyzer used for field, and all recipes having juice and mango in its name one after another would match + ``` + get products/_search + { + "query": { + "match_phrase": { + "name": "juice (mango)" + } + } + } + ``` +- but, what if we want to allow for e.g. "spicy tomato sauce" to match "spicy sauce"? +- so, we can add the **slop** parameter to the query as follows - + ``` + get proximity/_search + { + "query": { + "match_phrase": { + "title": { + "query": "spicy sauce", + "slop": 1 + } + } + } + } + ``` +- when we say slop = 1, it basically means that the term can moved around once. we can move sauce +- lets say slop is 2. this means we are allowed two moves. 
in this case, "spicy sauce" will also match "sauce spicy" + + | **original** | spicy | sauce | | + | **slop 1** | | spicy, sauce | | + | **slop 2** | | sauce | spicy | + +- **edit distance** is another synonym for this concept +- e.g. of building a naive search - use a [bool query](#compound-queries---bool) + - `must` can use `match` with spicy sauce - recall how by default, or operator would be used + - `should` can use `match_phrase` with spicy sauce, and have some slop as well, to help boost documents with spicy sauce close by + +## Compound Queries - Bool + +- we can combine **leaf queries** to form complex **compound queries** +- we can have multiple nested compound queries +- **must** - must be present +- **must not** - must not be present +- **should** - their presence is not mandatory, but they help boost the relevance scores +- e.g. look for alcohol, not wine, and we are particularly interested in beer. note - while we do not provide multiple queries, each of must, must not and should is an array, thus allowing for multiple term level / full text queries + ``` + get products/_search + { + "query": { + "bool": { + "must": [ + { + "term": { + "tags.keyword": "Alcohol" + } + } + ], + "must_not": [ + { + "term": { + "tags.keyword": "Wine" + } + } + ], + "should": [ + { + "multi_match": { + "query": "beer", + "fields": ["name", "description"] + } + } + ] + } + } + } + ``` +- a special note - if we do not have must / must not clauses and only the should clause, one of all the queries inside should "should" match (no pun intended). maybe because if this was not the case, technically all documents of the index would be a part of the result set, and would just have different relevance scores. so for e.g. if we wanted to model scenarios like "or", we can just specify them inside the should query. at least one of the conditions inside or (i.e. inside should) would match, and documents specifying more number of conditions in the or clause would be ranked higher +- recall how we said should only affects the scoring if either must or must not is present. we can change this behavior by providing **minimum should match** clause +- **filter** - will just filter documents. unlike must, it would not contribute to the relevance score. e.g. if we are just looking for all products of type alcohol, we do not need it to contribute to the relevance score +- **filter execution context** - filter execution context does not contribute to the relevance score. thus, it is more optimal. additionally, queries inside the filter execution context can be cached for higher performance. e.g. must not and filter +- **query execution context** - contributes to the relevance score. thus, slower and cannot be cached. e.g. must and should + +## Compound Queries - Boosting + +- **boosting** - e.g. we want the functionality of should, but it should reduce the relevance score +- what we specify inside **positive** has to match (like must of bool) +- the scores of documents that match what we specify inside **negative** is reduced (opposite of should in bool) +- "the factor" by which we want the score to be reduced can be specified via **negative boost** +- e.g. "i want" juice, but i "do not like" not apple - + ``` + get products/_search + { + "query": { + "boosting": { + "positive": { + "match": { + "name": "juice" + } + }, + "negative": { + "match": { + "name": "apple" + } + }, + "negative_boost": 0.2 + } + } + } + ``` +- e.g. i like pasta, but not bacon. so, both are optional, unlike above where juice was mandatory. 
so, we need to combine both boosting (for its negative) and bool (for its should). additionally, notice how we use match_all inside must of bool (if only should is present, it would become mandatory) + ``` + get products/_search + { + "query": { + "boosting": { + "positive": { + "bool": { + "must": [ + { + "match_all": {} + } + ], + "should": [ + { + "term": { + "tags.keyword": "Pasta" + } + } + ] + } + }, + "negative": { + "term": { + "tags.keyword": "Bacon" + } + }, + "negative_boost": 0.2 + } + } + } + ``` + +## Compound Queries - Disjunction Max + +- **disjunction max** - we can specify multiple queries +- the query with highest relevance score is the one that is used ultimately +- we can however, use a **tie breaker** for the other matches +- recall how the working of this is exactly like [multi match](#full-text-queries). there, we specify multiple fields, here we specify multiple queries +- in fact a multi match query is converted into a dis max query. multi match query - + ``` + get products/_search + { + "query": { + "multi_match": { + "query": "vegetable", + "fields": ["name", "description"], + "tie_breaker": 0.7 + } + } + } + ``` +- equivalent dis max query - + ``` + get products/_search + { + "query": { + "dis_max": { + "queries": [ + { "match": { "name": "vegetable" } }, + { "match": { "description": "vegetable" } } + ], + "tie_breaker": 0.7 + } + } + } + ``` + +## Nested Queries + +- if we have nested objects, dot notation works just fine +- recall how we should use nested type and not object type if we want correlation between the different fields for an array of objects +- how to search through an array of nested type - + ``` + get recipes/_search + { + "query": { + "nested": { + "path": "ingredients", + "query": { + "bool": { + "must": [ + { + "match": { + "ingredients.name": "parmesan" + } + }, + { + "range": { + "ingredients.amount": { + "gte": 100 + } + } + } + ] + } + } + } + } + } + ``` +- how the score of the matching child documents effect the score of the parent document is determined via **score mode**. it is average by default (average of the scores of all the matching child documents), but it can be changed to min, max, sum. we just need to add `"score_mode": "max"` to the `nested` object for this +- if we add the **inner hits** parameter, we get all the nested documents that matched, with what score etc. understand that by default, we will get only one score which is for the parent document. this parameter helps us dig deeper into what nested document matched, with what score, etc. we just need to add `"inner_hits": {}` to the `nested` object for this + +## Controlling Query Results + +- specify format using ?format. can be for e.g. yaml +- use ?pretty if using curl so that the json response is properly formatted. response when using kibana is anyways always formatted, this is more when using for e.g. shell + ``` + curl --cacert ~/elasticsearch-8.12.0/config/certs/http_ca.crt \ + -u elastic:7Nb_iz3DKsvOgWirudWq \ + -XGET "https://localhost:9200/_cat/nodes?format=json&pretty" + ``` +- we can specify `_source` key to decide what attributes the result should return. by default, the entire document is returned. use case - we only need the ids and want to fetch the original data from another source. it is like projection in sql. set it to false for just the ids, or specify the attributes to include / exclude +- control the number of results returned using the `size` parameter. 
the default is 10 - + ``` + get products/_search + { + "size": 100, + "query": { + "match_all": {} + } + } + ``` +- to implement pagination, we can implement the offset using `from` - + ``` + get products/_search + { + "size": 1, + "from": 2, + "query": { + "match_all": {} + } + } + ``` +- implementing pagination + - total_pages = ceil(total_hits / page_size) + - `size` is page size + - `from` is page_size * (current_page - 1) +- sorting results - default is sorting by score. also note that sorting by name would throw an exception like - `Text fields are not optimized ...`, so use name.keyword. recall that [default dynamic mapping](#mapping) would generate [multi field mapping](#multi-field-mappings) for strings, with both text and keyword variant + ``` + get products/_search + { + "size": 10, + "from": 2, + "sort": [ + { "price": "desc" }, + { "name.keyword": "asc" } + ], + "query": { + "match_all": {} + } + } + ``` +- assume field is multi valued (elasticsearch does not care if a field is an [array](#arrays-in-elasticsearch)). we can then inside the sort array, structure the object like so - + ``` + { + "price": { + "order": "desc", + "mode": "avg" + } + } + ``` + +## Metric Aggregations + +- **metric aggregations** - calculate metric like sum, average, etc on the field we specify. e.g. - + ``` + get orders/_search + { + "size": 0, + "aggs": { + "total_sales": { + "sum": { + "field": "total_amount" + } + } + } + } + ``` +- `total_sales` is the name of the aggregation, inside which we specify the type of aggregation e.g. `sum`, and inside `sum`, we provide the name of the field to perform the aggregation on +- we set the size to 0 because retrieving the documents is of no use to us. i the case above, just all documents would be fetched / based on size specified +- similarly, we can use `avg`, `min`, `max`, `cardinality` (for distinct values) +- note - we can also specify `query` etc to filter out the documents on which we want to perform the aggregation +- so, to get the number of documents on which our aggregation was performed, we can use `value_count` +- we can use `stats` for a summary that includes aggregations like min, max, etc + +## Bucket Aggregations + +- **bucket aggregations** - we calculate aggregations for buckets of documents. documents fall into a bucket, and aggregations are not calculated for a specific field like in metric aggregation +- my brain cannot remember this syntax, so just understand and remember the idea for reference, but refer docs for the actual syntax +- there are many more bucket aggregations like [range](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-range-aggregation.html) for custom ranges of numbers / dates, [histogram](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-histogram-aggregation.html) to automate this bucket creation, etc, refer documentation based on use case + +### Term in Bucket Aggregation + +- [term](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html) - based on the field we specify, it will dynamically create the buckets for us. e.g. 
log level for logs, status for order like below, etc + ``` + get orders/_search + { + "size": 0, + "aggs": { + "status_agg": { + "terms": { + "field": "status" + } + } + } + } + ``` +- this helps us get different buckets for each order status, where each bucket contains the number of documents present in it +- additionally, to get the documents which have for e.g. the status field set to null / do not have the status field, we can add the following inside `terms` above - + ``` + "missing": "N/A", // + "min_doc_count": 0 // + ``` +- `missing` helps set name of bucket with documents containing missing status field to "N/A" +- why set `min_doc_count` - the bucket would not be returned if no faulty documents are present. setting it to 0 helps ensure even buckets with 0 documents are returned +- note - bucket aggregations are not always accurate. when our query reaches the coordinating node, it asks each shard for the top 10 documents. now, the count of status pending can be in top 10 of the first shard, but not necessarily in the second shard. so, all of the pending orders might not be present in the bucket once the coordinating node aggregates the result from both first and second shard. solution - increase the size parameter so that the default of 10 is not used. issue - it will effect performance + +### Nested in Bucket Aggregations + +- unlike metric aggregations, bucket aggregations allow for nesting +- in fact, we can nest a metric aggregation inside a bucket aggregation as well +- e.g. below, we will have stats like min, max, etc for each bucket. we create bucket using term discussed above + ``` + get orders/_search + { + "size": 0, + "aggs": { + "status_agg": { + "terms": { + "field": "status" + }, + "aggs": { + "status_stats": { + "stats": { + "field": "total_amount" + } + } + } + } + } + } + ``` + +### Filter in Bucket Aggregations + +- [filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-filter-aggregation.html) - e.g. i want the avg price of all sales, and i also the average price for sales of t-shirt + ``` + get /sales/_search?size=0&filter_path=aggregations + { + "aggs": { + "avg_price": { "avg": { "field": "price" } }, + "t_shirts": { + "filter": { "term": { "type": "t-shirt" } }, + "aggs": { + "avg_price": { "avg": { "field": "price" } } + } + } + } + } + ``` +- response will contain both the average price of t-shirt's sales and average price of all sales +- remember - if we for e.g. wanted just the average sales of t-shirts, we would run the below i.e. a query will filter the documents then the aggs would only run on the filtered documents + ``` + get /sales/_search?size=0&filter_path=aggregations + { + "query": { "term": { "type": "t-shirt" } }, + "aggs": { + "avg_price": { "avg": { "field": "price" } } + } + } + ``` + +### Filters in Bucket Aggregations + +- [filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-filters-aggregation.html) - helps us perform aggregations on custom buckets +- e.g. 
max length of log for errors and warnings + ``` + get logs/_search + { + "size": 0, + "aggs": { + "messages": { + "filters": { + "filters": { + "errors": { "match": { "body": "error" }}, + "warnings": { "match": { "body": "warning" }} + } + }, + "aggs": { + "max_length": { + "max": { + "field": "message_length" + } + } + } + } + } + } + ``` + +## Kibana + +- open source ui to visualize elasticsearch data +- it also stores its data inside elasticsearch itself, thus helping us avoid issues around backups, easily scale kibana horizontally, etc +- dashboards are dynamic as well with interactivity +- **data views** - + - was called **index patterns** in the past + - we specify an index pattern here, and all indexes matching this pattern will be queried by kibana + - e.g. for logs, it is typical to have one index per month to help with scaling, as having all the data inside one index might not scale well + - optionally, we can also set the timestamp field when configuring a data view which helps filter the data in dashboards by time +- kibana has different apps like apm, maps, dashboard, etc +- **kql** or **kibana query language** - quickly put together some querying to filter documents instead of the verbose elasticsearch's query dsl. kql is internally converted to the equivalent elasticsearch query dsl. some tips - + - simply type some words to search for them in all fields - `products fischer` + - search for the exact phrase by surrounding using double quotes - `"products fischer"` + - search for documents with specific values for a field using operators - `http.response.status_code : 400` + - `:` is for equals, we can also use `>`, `<=`, etc + - we can have multiple conditions and combine them using boolean operators like `and` and `or` + - invert condition using `not` + - make groups using parentheses `()` to for e.g. avoid relying on the default precedence + - we can use wildcards for values - `url.path : /brands*` +- kibana -> discover for browsing through index data as is. it is meant for adhoc analysis of data + 1. data view - select the data view to use + 2. kql - enter kql + 3. time - enter time range, absolute or relative. recall the timestamp field we set when creating the data view. this is a common feature in different apps across kibana + 4. histogram - based on our time range, elasticsearch will automatically create the histogram. e.g. since my time range was of three days, it generated buckets of an hour, and shows number of documents inside each bucket + 5. fields - by default all fields are displayed. e.g. to only see values of certain fields in 6., we can select the fields here + 6. messages - the actual documents (log messages in this case) + + ![discover](/assets/img/elasticsearch/discover.drawio.png) + +- note about time - throughout the ui, time is in our timezone, but when queries are sent from kibana, they are sent in utc format + - why is it good - e.g. if i want to see the logs in the last 5 hours. i will simply see the values in my timezone / query using my timezone, without having to care about the conversion myself + - when it can be a problem - i want to see the logs for 1st of january. ist of january can mean different times in different timezones. 
so, i might want to adjust the times in my query / change the timezone itself in kibana +- to create visualizations - from the sidebar, go to analytics -> visualize library -> create visualization + - my understanding - the kql and time filters at the top are available at the top - the kql we enter is a part of the visualization, but the timestamp we select is not +- we configure from the right pane +- the left pane shows us the actual values +- in the right pane, there are two sections + - metrics - for the actual metric - average of total in this case + - buckets - for the parameter to bucket based on - product category in this case +- metric visualization example -
+ ![metrics](/assets/img/elasticsearch/metrics.png) +- when doing nested aggregations, a typical example can be - + - bucket using date histogram and timestamp field + - create the sub aggregation using [term](#term-in-bucket-aggregation) and e.g. status code field +- in visualizations like line chart etc, we can select the bucket type to be **split series** or **split chart**. split series will show for e.g. in the same chart, whereas split chart will create separate charts in the same visualization +- the order of aggregations might matter sometimes - e.g. if i want the date histogram of the top 5 most accessed url paths throughout - + - if i first bucket by date and then by url path, kibana would show the top 5 urls for every interval + - however, if i reverse this order of bucketing, i get the right output +- example of line chart. notice the configuration on the right, with the right ordering explained above
+ ![line chart](/assets/img/elasticsearch/line-chart.png) +- note - bar, area and line chart are configured in the same way, they are just different to look at +- recall [filters](#filters-in-bucket-aggregations) in bucket aggregations. we use kql to specify custom buckets
+ ![filters](/assets/img/elasticsearch/filters.png) +- note - for above use case, we could have used range as well, but terms might be useful for more custom bucketing use cases +- when using date histogram, the interval can be set to **auto** - kibana decides the best interval automatically, e.g. a day if the range is a few months, or an hour if it is a day, etc +- **data table** - e.g. we want total sales for all salesmen + - we should use terms aggregation (since the buckets are dynamic). the field to use would be salesmen's id + - just like in bar chart etc, data table would have **split rows** and **split table** + - we add the metric to be sum of total + - now, just the salesmen's id and total might not be good to look at - we might want to add the salesmen's name. so, we use a "no op" metric like "top hits". this way, the top document of the bucket is used, and we use the field as salesman's name. in our case, that hardly matters because all orders in the same bucket have the same salesman + - we can order the buckets using one of the metrics - when configuring the bucket using salesmen's id, we configure it to order using the metric we added - sum of total (refer 3rd point) + - we can configure the data table to for e.g. display the total at the end + + ![data table](/assets/img/elasticsearch/data-table.png) + +- **heat maps** - the basic one which we might probably use is matrix based, but other use cases include - + - actual maps - e.g. which region has the most activity on our website + - on websites - e.g. like eye tracker - which areas of our website draw the most attention +- e.g. we would like to see the peak hours on the different pages of our website + - we already have an "hour of day" field to use histogram aggregation on. this way, we get it for each hour + - now, we use terms aggregation for the "url path" field + - each cell shows how many visits were there for a particular hour and url path. clearly, activity is more around 8am to 8pm + + ![heat map](/assets/img/elasticsearch/heat-map.png) + +- **tag clouds** - bubbles would be larger for the tags with higher values. e.g. bucket using cities, and measure sum of order totals
+ ![tag clouds](/assets/img/elasticsearch/tag-clouds.png) +- **dashboards** - orchestration of visualizations. they typically show all the visualizations from the same data view +- when we edit the visualizations for a dashboard, we can either modify the visualization itself, or override the settings of the visualization at the dashboard level, thus leaving the visualization as is +- **interactivity** - when we click on the chart, it automatically adds **filters** (they are like ksql i.e. present at the top) and update other visualizations using these filters as well. similarly, if we select some areas (like rectangles) on the charts, it will automatically set the timestamp filters on the dashboard and update other visualizations using this time range as well. e.g. refer the filter below on the left and the time range on the right, which were added based on our interactivity with the visualizations + ![interactivity](/assets/img/elasticsearch/interactivity.png) diff --git a/_posts/2024-03-02-low-level-design.md b/_posts/2024-03-02-low-level-design.md new file mode 100644 index 0000000..5b2a1ed --- /dev/null +++ b/_posts/2024-03-02-low-level-design.md @@ -0,0 +1,1276 @@ +--- +title: Low Level Design +mermaid: true +--- + +## Some Principles + +- **dry** - don't repeat yourself - the code should be changed in a single place only +- **yagni** - you aren't gonna need it - do not introduce / foresee features you will not need in future, e.g. undo functionality +- **kiss** - keep it simple and stupid - e.g. when adding a task in a todo application, do not think about priority, deadline etc - highlight only the title and description fields, and hide the rest of them as optional fields + +## SOLID Principles + +### Single Responsibility Principle + +- "a class should have only one reason to change" +- it should not handle multiple concerns +- this increases "cohesion" - only related code belongs together +- it improves readability +- it also makes writing focused tests easier + +### Open Closed Principle + +- "open for extension" - extend the functionality without touching existing code +- this is done using principles like composition, inheritance, etc +- "closed for modification" - do not add extra functionality to existing code, since it is already tested +- e.g. instead of bundling everything inside one class, have a generic `Writer` interface, and have different concrete implementations like `DBWriter`. for new functionality, we add a new writer `FileWriter` instead of touching the existing code + +### Liskov Substitution Principle + +- "sub classes should be able to substitute base classes" +- subclass should not reduce the feature set offered by base class, only increase it +- e.g. 
below violates liskov substitution - + +```java +class Vehicle { + + void startEngine() {} +} + +class Bicycle extends Vehicle { + + void startEngine() { + throw new RuntimeException("no engine present..."); + } +} +``` + +- solution - break into different interfaces - + +```java +class Vehicle {} + +class MotorVehicle { + + void startEngine() {} +} + +class Bicycle extends Vehicle {} +``` + +### Interface Segregation Principle + +- "clients should not be forced to depend on interfaces they do not use" +- this prevents "fat" interfaces +- example can be same as liskov above + +### Dependency Inversion Principle + +- "depend on abstractions, not concrete implementations" +- "decoupling" - modules will not have to change with change in underlying implementations +- "abstractions should not depend on details, but details should depend on abstractions" +- can be achieved through techniques like "dependency injection" - dependencies are provided to the class from outside instead of the class itself instantiating them +- thus implementations can also be swapped easily, e.g. - + +```java +class Computer { + + private final Keyboard keyboard; + private final Mouse mouse; + + Computer(Keyboard keyboard, Mouse mouse) { + this.keyboard = keyboard; + this.mouse = mouse; + } +} + +class BluetoothKeyboard implements Keyboard {} +class WiredKeyboard implements Keyboard {} + +class BluetoothMouse implements Mouse {} +class WiredMouse implements Mouse {} +``` + +## Object Oriented Analysis and Design using UML + +- procedural programming was about organizing code into blocks to help manipulate data +- oop organizes the code and wraps the data and functionality inside an object +- object oriented analysis - + - we identify the objects in a system + - we establish the relationship between them + - finally, we make the design that can be converted to executable code in our object oriented language +- uml or unified modelling language helps model the object oriented analysis +- it helps communicate design decisions easily by breaking down a complex system into smaller, understandable pieces + +### Use Case Diagrams + +- "use case" - set of actions the system can perform +- "actors" - external users of the system +- gives a high level functional behavior of the system +- models the relationship between actors and use cases, as well as between the different use cases +- "system boundary" - limit the scope of the system +- "include" - invocation of one use case by another use case (like invoking a method) +- "extend" - works like the base use case it extends with additional steps +- extend can also be used for conditional use cases. e.g. pay fine only on late returns, not all returns + +![use case](/assets/img/low-level-design/use-case.drawio.png) + +### Class Diagram + +- helps show how different entities relate to each other +- map directly to object oriented language +- the representation of class has three sections - class name, properties and methods +- "visibility" - we can put this ahead of the attributes / methods. `+` for public, `-` for private and `#` for protected and `~` for default +- "associations" - if two classes communicate with each other, there needs to be a link between them +- associations can be bidirectional (both classes are aware of each other) or unidirectional (only one class is aware) +- "multiplicity" - how many instances of the class participate in the relationship +- "inheritance" is also called an "is a" relationship. 
denoted by open arrows (the head is not filled) +- for abstract class, use italics +- composition / aggregation are also called a "has a" relationship +- "aggregation" - lifecycle of the child class is independent of the parent class. denoted by open arrows with diamonds at end +- "composition" - lifecycle of the child class is dependent on the parent class i.e. the child cannot exist independent of the parent. denoted by closed arrows with diamonds at end +- "generalization" - combining similar classes into a single class +- basic e.g. - + - inheritance between customer / admin and user + - composition (with multiplicity) between orders and customers + +```mermaid +classDiagram + +User <|-- Admin +User <|-- Customer +Order "*" *-- "1" Customer + +class User { + -name +} + +class Order { + -customerId + -creationDate + -shippingDate + +place() +} + +class Admin { + +updateCatalog() +} + +class Customer { + +register() + +login() +} +``` + +### Sequence Diagrams + +- sequence of interactions in terms of messages +- the vertical dimension represents the chronological order of the messages +- the horizontal dimension shows the messages that are sent +- used for "dynamic modelling" i.e. how objects interact with each other + +```mermaid +sequenceDiagram + +participant Customer +participant ATM +participant Account +participant Screen + +Customer->>ATM: Balance Inquiry +ATM->>Account: Get Balance +Account->>ATM: Balance +ATM->>Screen: Display Balance +Screen->>Customer: Show Message +``` + +### Activity Diagrams + +- flow of control from one activity to another +- "activity" - an operation that results in a change of state +- used for "functional modelling" i.e. how inputs map to outputs + +![activity](/assets/img/low-level-design/activity.drawio.png) + +## Design Patterns Introduction + +- problems that occur frequently have well defined solutions +- three broad categories - creational, structural, behavioral +- creational - how objects are constructed from classes +- structural - composition of classes i.e. how classes are constructed +- behavioral - interaction of classes and objects with one another and the delegation of responsibility + +## Creational Patterns + +### Builder Pattern + +- separate the representation of object from its construction process +- e.g. helps prevent "telescoping constructors" - + ```java + Aircraft(Engine engine); + Aircraft(Engine engine, Cockpit cockpit); + Aircraft(Engine engine, Cockpit cockpit, Bathroom bathroom); + ``` +- "product" - what we want to create - aircraft here +- we have a "builder" interface +- implementations of this builder are called "concrete builders" +- the builder has empty / default implementations +- this way, the builder methods can be selectively overridden depending on variant +- "director" - has the "algorithm" to help create products using builders +- sometimes, the director can be skipped - the client invokes the methods on builder directly +- pretty similar to [abstract factory](#abstract-factory-pattern) + +
+code example +
+
+abstract class AircraftBuilder {
+
+  void buildCockpit() {}
+  void buildEngine() {}
+  void buildBathroom() {}
+  Aircraft getResult() {}
+}
+                                          // no bathrooms in f16
+class BoeingBuilder                       class F16Builder
+    extends AircraftBuilder {                 extends AircraftBuilder {
+
+  @Override void buildCockpit() {}          @Override void buildCockpit() {}
+  @Override void buildEngine() {}           @Override void buildEngine() {}
+  @Override void buildBathroom() {}         @Override Aircraft getResult() {}
+  @Override Aircraft getResult() {}       }
+}
+
+class Director {
+
+  AircraftBuilder aircraftBuilder;
+
+  Aircraft construct(boolean isPassenger) {
+    aircraftBuilder.buildCockpit();
+    aircraftBuilder.buildEngine();
+    if (isPassenger) {
+      aircraftBuilder.buildBathroom();
+    }
+    return aircraftBuilder.getResult();
+  }
+}
+
+
+
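+
+- a minimal usage sketch (hypothetical wiring - it assumes the director is handed the concrete builder, e.g. through a constructor, which the sketch above leaves out) -
+
+```java
+// the client picks the concrete builder, the director runs the construction algorithm
+Director director = new Director(new BoeingBuilder());
+Aircraft passengerJet = director.construct(true);   // cockpit + engine + bathroom
+
+Director f16Director = new Director(new F16Builder());
+Aircraft f16 = f16Director.construct(false);        // no bathroom in the f16
+```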
+ +### Singleton Pattern + +- create only one instance of a class +- e.g. thread pool, registries, etc +- we make the constructor "private" so that other classes cannot instantiate it +- some methods have been discussed below + +
+not thread safe +
+
+class AirForceOne {
+
+  private static AirForceOne instance;
+
+  private AirForceOne() { }
+
+  public static AirForceOne getInstance() {
+
+    if (instance == null) {
+      instance = new AirForceOne();
+    }
+
+    return instance;
+  }
+}
+
+
+
+ +
+synchronized - makes code slow as every invocation acquires a lock +
+
+synchronized public static AirForceOne getInstance() {
+  // ...
+}
+
+
+
+ +
+static initialization - if instantiation is expensive, it can cost us performance if object is never used +
+
+private static AirForceOne instance = new AirForceOne();
+
+
+
+ +
+"double checked locking" - solves all problems, but not generally recommended +
+
+class AirForceOne {
+
+  // IMP - notice the use of volatile
+  private volatile static AirForceOne instance;
+
+  private AirForceOne() { }
+
+  public static AirForceOne getInstance() {
+
+    if (instance == null) {
+      synchronized(AirForceOne.class) {
+        if (instance == null) {
+          instance = new AirForceOne();
+        }
+      }
+    }
+
+    return instance;
+  }
+}
+
+
+
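+
+- another well known variant is the "initialization on demand holder" idiom - lazy and thread safe without explicit locking, because the jvm loads the nested holder class only on first use. a minimal sketch -
+
+```java
+class AirForceOne {
+
+  private AirForceOne() { }
+
+  // the holder class is loaded (and the instance created) only when
+  // getInstance() is called for the first time - class loading is thread safe
+  private static class Holder {
+    private static final AirForceOne INSTANCE = new AirForceOne();
+  }
+
+  public static AirForceOne getInstance() {
+    return Holder.INSTANCE;
+  }
+}
+```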
+ +### Prototype Pattern + +- create new objects by copying existing objects +- "prototype" - the seed object from which other objects get created +- sometimes, cloning can be more performant than creating entirely new instances +- another advantage - instead of too many subclasses, vary behavior by changing fields - two separate classes for boeing and f16 are not required below +- use case - "dynamic loading" - e.g. we do not have access to constructors. the runtime environment registers prototypes with the "prototype manager", so that whenever an object is requested, a copy is returned by this prototype manager +- "shallow" vs "deep" copy - nested fields would be shared in shallow copy unlike in deep + +
+code example +
+
+class F16 implements Aircraft {
+
+  void setEngine(Engine engine) { }
+
+  Aircraft clone() { /* deep copy */ }
+}
+
+Aircraft f16A = aircraft.clone();    Aircraft f16B = aircraft.clone();
+f16A.setEngine(f16AEngine);          f16B.setEngine(f16BEngine);
+
+
+
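+
+- a rough sketch of the "prototype manager" mentioned above (the class and method names here are assumptions, and it assumes the `Aircraft` interface exposes the `clone()` method used above) -
+
+```java
+import java.util.HashMap;
+import java.util.Map;
+
+class PrototypeManager {
+
+  private final Map<String, Aircraft> prototypes = new HashMap<>();
+
+  void register(String key, Aircraft prototype) {
+    prototypes.put(key, prototype);
+  }
+
+  Aircraft create(String key) {
+    // every request hands out a fresh copy of the registered seed object
+    return prototypes.get(key).clone();
+  }
+}
+```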
+ +### Factory Method Pattern + +- delegate the actual instantiation to subclasses +- the factory method may or may not provide a default implementation +- the subclass will override this implementation +- downside - compare with [prototype pattern](#prototype-pattern) - it results in too many subclasses + +
+code example +
+
+class F16 {
+
+  protected Aircraft makeF16() {
+    cockpit = new Cockpit();
+  }
+}
+
+class F16A extends F16 {            class F16B extends F16 {
+
+  @Override                           @Override
+  public Aircraft makeF16() {         public Aircraft makeF16() {
+    super.makeF16();                    super.makeF16();
+    engine = new F16AEngine();          engine = new F16BEngine();
+  }                                   }
+}                                   }
+
+F16 f16A = new F16A(); f16A.makeF16();
+F16 f16B = new F16B(); f16B.makeF16();
+
+
+
+ +### Abstract Factory Pattern + +- creating families of related objects without specifying their concrete classes +- we have "abstract factory" returning "abstract products" +- "concrete factories" override these abstract factory methods and return "concrete products" +- now, only the right concrete factory needs to be passed to the aircraft to construct it +- in [factory method pattern](#factory-method-pattern), we were using inheritance to create a single product +- here, we create a family of products using composition +- concrete factories can be [singleton](#singleton-pattern) + +
+code example +
+
+class Aircraft {
+
+  void makeAircraft(AircraftFactory aircraftFactory) {
+    engine = aircraftFactory.makeEngine();
+    cockpit = aircraftFactory.makeCockpit();
+  }
+}
+
+interface AircraftFactory {
+
+  Engine makeEngine();
+  Cockpit makeCockpit();
+}
+
+class BoeingAircraftFactory implements AircraftFactory {
+
+  @Override Engine makeEngine() { return new BoeingEngine(); }
+  @Override Cockpit makeCockpit() { return new BoeingCockpit(); }
+}
+
+class F16AircraftFactory implements AircraftFactory {
+
+  @Override Engine makeEngine() { return new F16Engine(); }
+  @Override Cockpit makeCockpit() { return new F16Cockpit(); }
+}
+
+
+
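+
+- usage sketch - the client only decides which concrete factory to pass in, and gets a consistent family of parts back -
+
+```java
+Aircraft f16 = new Aircraft();
+f16.makeAircraft(new F16AircraftFactory());        // f16 engine + f16 cockpit
+
+Aircraft boeing = new Aircraft();
+boeing.makeAircraft(new BoeingAircraftFactory());  // boeing engine + boeing cockpit
+```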
+ +## Structural Patterns + +### Adapter Pattern + +- allows incompatible classes to work together by converting the interface of one class into another +- e.g. our aircraft business now needs to accommodate hot air balloons +- "adaptee" is the incompatible class - hot air balloon +- "target" is the interface the client (i.e. our code) understands - aircraft +- "adapter" is the class sitting in between, which is composed using adaptee and implements the target +- usually done after a system is designed to accommodate to fit additional requirements +- this entire process discussed above is called "object adapter" +- we can also use the "class adapter" pattern - where the adapter extends both the adaptee and the target +- disadvantage - multiple inheritance is not supported by java + +
+code example +
+
+interface Aircraft {
+
+  void takeOff();
+}
+
+class Adapter implements Aircraft {
+
+  HotAirBalloon hotAirBalloon;
+
+  Adapter(HotAirBalloon hotAirBalloon) {
+    this.hotAirBalloon = hotAirBalloon;
+  }
+
+  @Override
+  void takeOff() {
+    hotAirBalloon.inflateAndFly();
+  }
+}
+
+// now, client can use adapter like any other `Aircraft`
+
+
+
+ +### Bridge Pattern + +- helps separate abstraction and implementation into two different class hierarchies +- e.g. we have two shapes - circle and square +- now, we want to introduce two colors - blue and red +- we will end up with four classes - blue circle, blue square, red circle, red square +- this can grow exponentially +- another problem - changes to color and shape effect each other - they are not decoupled +- so, we split into two separate hierarchies - shape and color +- so, we have "abstraction" and "refined abstraction" (shapes) +- then, we have "implementation" and "concrete implementation" (colors) +- so, instead of inheritance, we use composition +- we compose the refined abstractions using the concrete implementations + +
+code example +
+
+class Shape {
+
+  private Color color;
+
+  Shape(Color color) {
+    this.color = color;
+  }
+}
+
+class Circle extends Shape {   class Square extends Shape {
+
+  Circle(Color color) {       Square(Color color) {
+    super(color);               super(color);
+  }                           }
+}                           }
+
+interface Color {}
+class Red implements Color {}
+class Blue implements Color {}
+
+
+
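+
+- usage sketch - any shape can now be combined with any color, without a dedicated subclass per combination -
+
+```java
+Shape redCircle = new Circle(new Red());
+Shape blueSquare = new Square(new Blue());
+```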
+ +### Composite Pattern + +- helps compose our model in a tree like structure and work with them +- e.g. an air force can have several levels of nested air forces, and ultimately the last level of air force would be composed of planes +- "composite" - helps model the trees / subtrees +- "leaves" - the last level in these trees +- "component" - both the leaf and composite are coded to this common interface +- now, the client can simply call `getPersonnel` and treat the composite / leaf as the same +- it uses [internal iterator](#iterator-pattern) - the iterator is not exposed, and is handled by the composite itself + +
+code example +
+
+interface Alliance {
+
+  int getPersonnel();
+}
+
+class AirForce implements Alliance {
+
+  private Alliance[] alliances;
+
+  @Override
+  int getPersonnel() {
+
+    int personnel = 0;
+
+    for (Alliance alliance : alliances) {
+      personnel += alliance.getPersonnel();
+    }
+
+    return personnel;
+  }
+}
+
+interface Aircraft { }
+
+class F16 implements Aircraft, Alliance {
+
+  @Override
+  int getPersonnel() {
+    return 2;
+  }
+}
+
+class Boeing implements Aircraft, Alliance {
+
+  @Override
+  int getPersonnel() {
+    return 10;
+  }
+}
+
+
+
+ +### Decorator Pattern + +- extend the behavior of an object dynamically +- the decorator basically adds to the existing functionality, by for e.g. taking some action before / after invoking the method on the wrapped component +- alternative to creating more subclasses +- e.g. below, the luxury and bulletproof variants could have been subclasses of boeing as well +- but then we could not wrap a different aircraft with different decorators +- "component" - the common interface to which the component and decorator is coded +- "concrete component" - what we wrap +- "decorator" - an interface for different decorators. this will also extend the component +- "concrete decorator" - the actual implementation of decorators. they wrap the concrete components +- we can wrap using multiple decorators as well +- e.g. below, we can make an aircraft bulletproof and luxurious, which affects its weight but its flying method stays the same +- the advantage is that the client code is agnostic of all this - it still codes to component +- notice how the decorator is composed using the component + +
+code example +
+
+interface Aircraft {
+  
+  void fly();
+  
+  int getWeight();
+}
+
+class Boeing implements Aircraft {    class F16 implements Aircraft {
+
+  @Override                             @Override
+  public void fly() {                   public void fly() {
+    System.out.println("flying");         System.out.println("soaring");
+  }                                     }
+
+  @Override                             @Override
+  public int getWeight() {              public int getWeight() {
+    return baseWeight;                    return baseWeight;
+  }                                     }
+}                                     }
+
+abstract class Decorator implements Aircraft { }
+
+class BulletProofDecorator extends Decorator {
+
+  Aircraft aircraft;
+
+  @Override
+  public void fly() {
+    aircraft.fly();
+  }
+
+  @Override
+  public int getWeight() {
+    return aircraft.getWeight() + 13;
+  }
+}
+
+class LuxuriousDecorator extends Decorator {
+
+  Aircraft aircraft;
+
+  @Override
+  public void fly() {
+    aircraft.fly();
+  }
+
+  @Override
+  public int getWeight() {
+    return aircraft.getWeight() + 27;
+  }
+}
+
+Aircraft boeing = new Boeing();
+Aircraft ceoPlane = new BulletProofDecorator(new LuxuriousDecorator(boeing));
+ceoPlane.getWeight(); // cumulative weight - includes both decorators
+
+
+
+ +### Facade Pattern + +- a single uber interface to a subsystem to make working with it easier +- the client will now interface with the "facade" and not worry about the complexities of the subsystem +- changes to the subsystem will now affect the facade and not the client + +
+code example +
+
+class AutopilotFacade {
+
+  private BoeingAltitudeMonitor altitudeMonitor;
+  private BoeingEngineController engineController;
+  private BoeingNavigationSystem navigationSystem;
+
+  AutopilotFacade(BoeingAltitudeMonitor altitudeMonitor,
+      BoeingEngineController engineController, 
+      BoeingNavigationSystem navigationSystem) {
+    this.altitudeMonitor = altitudeMonitor;
+    this.engineController = engineController;
+    this.navigationSystem = navigationSystem;
+  }
+
+  void autopilotOn() {
+    altitudeMonitor.autoMonitor();
+    engineController.setEngineSpeed(700);
+    navigationSystem.setDirectionBasedOnSpeed(engineController.getEngineSpeed());
+  }
+
+  void autopilotOff() {
+    altitudeMonitor.turnOff();
+    engineController.turnOff();
+    navigationSystem.turnOff();
+  }
+}
+
+
+
+ +### Flyweight + +- sharing state among objects for efficiency +- e.g. if we use a global radar to track air crafts, we will end up with too many air craft objects for the same air craft at different coordinates +- "intrinsic state" - independent of the context of object. e.g. top speed of the air craft +- "extrinsic state" - dependent of the context of object. e.g. coordinates of the air craft +- so, to prevent creation of too many objects, we store intrinsic state inside the object, while extrinsic state outside it +- this way, we automatically end up with less objects, since we only need new objects when the intrinsic state changes, and not every time the extrinsic state changes +- "flyweight" - the object has become light since it only stores intrinsic state now +- "flyweight factory" - used to create the flyweight objects, because we do not want the client to create them directly +- "context" - used to store the extrinsic state + +
+code example +
+
+class F16 implements IAircraft {
+
+  private final int topSpeed = 800;
+
+  int getTimeToDestination(int curX, int curY, int destX, int destY) {
+    int distance = ...;
+    return distance / topSpeed;
+  }
+}
+
+
+
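+
+- a sketch of the "flyweight factory" mentioned above (class and method names are assumptions) - it caches and hands out shared flyweights so that clients never construct them directly -
+
+```java
+import java.util.HashMap;
+import java.util.Map;
+
+class AircraftFlyweightFactory {
+
+  private final Map<String, IAircraft> cache = new HashMap<>();
+
+  IAircraft getF16() {
+    // the same F16 instance (intrinsic state only) is reused for every caller
+    return cache.computeIfAbsent("f16", key -> new F16());
+  }
+}
+```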
+ +### Proxy Pattern + +- calls to the "real subject" are hidden behind a "proxy" +- this way, the real subject is shielded from the client +- both implement the "subject" interface so that the client code does not change +- e.g. client will call methods like turn left and turn right on remote control +- the remote control will call these methods on the drone +- both of them implement an interface called `IDrone` +- "remote proxy" - when the real subject is located on a remote server, the calls made by the client actually reaches a proxy first +- the proxy sits on the same jvm, and the proxy then makes the request over the network to the real subject on the remote server +- "virtual proxy" - delays the object creation when it is expensive +- e.g. we see loading overlays or wire frames with same height and width while expensive pictures are loading +- "protection proxy" - acts as an authorization layer in between + +## Behavioral Patterns + +### Chain of Responsibility Pattern + +- decoupling the sender of a request from its receiver +- passing it along a chain of handlers till one of the handlers handle it or the request falls off the chain and remains unhandled +- use this pattern when a request can be handled by multiple objects and it is not known in advance which one will end up handling it +- we have a "handler" which all "concrete handlers" implement +- notice how all handlers maintain a reference to their successor + +
+code example +
+
+class ErrorCodes {
+
+  static final int LOW_FUEL = 1;
+  static final int HIGH_ALTITUDE = 2;
+}
+
+class Handler {
+
+  Handler next;
+
+  Handler(Handler next) {
+    this.next = next;
+  }
+
+  void handleRequest(int errorCode) {
+    if (next != null) {
+        next.handleRequest(errorCode);
+    }
+  }
+}
+
+class LowFuelHandler extends Handler {          class HighAltitudeHandler extends Handler {
+
+  LowFuelHandler(Handler next) {                  HighAltitudeHandler(Handler next) {
+    super(next);                                    super(next);
+  }                                               }
+
+  void handleRequest(int errorCode) {             void handleRequest(int errorCode) {
+    if (errorCode == ErrorCodes.LOW_FUEL) {         if (errorCode == ErrorCodes.HIGH_ALTITUDE) {
+      // ...                                          // ...
+    } else {                                        } else {
+      super.handleRequest(errorCode);                 super.handleRequest(errorCode);
+    }                                               }
+  }                                               }
+}                                               }
+
+
+
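+
+- usage sketch - wire the handlers into a chain and hand the request to the first one -
+
+```java
+// the request travels down the chain until some handler recognises the error code
+Handler chain = new LowFuelHandler(new HighAltitudeHandler(null));
+chain.handleRequest(ErrorCodes.HIGH_ALTITUDE);
+```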
+ +### Observer Pattern + +- "observers" subscribe to "subjects" for state changes +- so, we have "observer" and "concrete observers", "subject" and "concrete subjects" +- "push model" - the subject will push the new state into the observer when calling its update method +- "pull model" - the subject will call the observer's update method using itself i.e. `this` +- then, the observer can call the getter method on the subject which can expose individual bits of state + +
+code example +
+
+interface ISubject {
+
+  void addObserver(IObserver observer);
+
+  void removeObserver(IObserver observer);
+
+  void notifyObservers(Object newState);
+}
+
+interface IObserver {
+
+  void update(Object newState);
+}
+
+public class ControlTower implements ISubject {
+
+  List<IObserver> observers = new ArrayList<>();
+
+  @Override
+  public void addObserver(IObserver observer) {
+    observers.add(observer);
+  }
+
+  @Override
+  public void removeObserver(IObserver observer) {
+    observers.remove(observer);
+  }
+
+  // assume some poller calls this every 5 seconds
+  // with the current weather conditions etc
+  @Override
+  public void notifyObservers(Object newState) {
+    for (IObserver observer : observers) {
+      observer.update(newState);
+    }
+  }
+}
+
+class F16 implements IObserver {
+
+  ISubject subject;
+
+  public F16(ISubject subject) {
+    this.subject = subject;
+    subject.addObserver(this);
+  }
+
+  public void land() {
+    subject.removeObserver(this);
+  }
+
+  @Override
+  public void update(Object newState) {
+    // take appropriate action based on weather etc
+  }
+}
+
+
+
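+
+- usage sketch - the f16 subscribes itself to the control tower on construction and unsubscribes when it lands -
+
+```java
+ISubject controlTower = new ControlTower();
+F16 f16 = new F16(controlTower);   // registers itself inside the constructor
+
+// some poller would then periodically call controlTower.notifyObservers(latestWeather),
+// and every registered observer gets its update(...) invoked
+```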
+ +### Interpreter Pattern + +- a grammar defines if some code is syntactically correct or not +- "context free grammar" - has the following components - + - start symbol + - set of terminal symbols + - set of non terminal symbols + - set of production rules +- we keep expanding the non terminal symbols till we reach the terminal symbols +- any final expression we can derive is called a "sentence" +- the sentence is said to be in the "language of grammar" we defined +- e.g. we have three operations in a flight simulation software - glide, barrel roll, splits +- we cannot perform barrel rolls and splits one after another +- we need to start and end with glide +- the production rules will look like as follows - + ``` + -> + -> glide + -> barrel roll + -> splits + ``` +- ast (abstract syntax tree) - can be used to represent the sentences in our grammar +- in this ast, the internal nodes are non terminal symbols, while leaf nodes are terminal symbols +- an ast example -
+ ![interpreter ast](/assets/img/low-level-design/interpreter-ast.drawio.png) +- "abstract expression" - the interface +- the abstract expression can be a "terminal expression" or a "non terminal expression" +- the non terminal expression will hold a reference to the other abstract expressions based on the production rules +- how we interpret an expression depends on the "context" + +
+code example +
+
+interface AbstractExpression {
+
+  void interpret(Context context);
+}
+
+class Context {}
+
+class Flight implements AbstractExpression {
+
+  private AbstractExpression flightOne;
+  private AbstractExpression showOff;
+  private AbstractExpression flightTwo;
+
+  @Override
+  public void interpret(Context context) {
+  }
+}
+
+class ShowOff implements AbstractExpression {
+
+  private AbstractExpression barrelRoll;
+  private AbstractExpression splits;
+
+  @Override
+  public void interpret(Context context) {
+  }
+}
+
+class Glide implements AbstractExpression {
+
+  @Override
+  public void interpret(Context context) {
+  }
+}
+
+class BarrelRoll implements AbstractExpression { 
+
+  @Override
+  public void interpret(Context context) {
+  }
+}
+
+class Splits implements AbstractExpression {
+
+  @Override
+  public void interpret(Context context) {
+  }
+}
+
+
+
+ +### Command Pattern + +- represent an action or a request as an object +- this can then be passed to other objects as parameters +- these requests can then be queued for later execution +- think of it like "callbacks" +- e.g. when we press a button, it does not need not know what to do +- it only needs to know the object that knows what to do +- "receiver" - the object that knows what to do - `MissileLauncher` in this case +- "command" and "concrete command" - the command is composed of the receiver. it is the abstraction layer - `Command` and `FireMissileCommand` in this case +- "invoker" - invokes the command - it is unaware of the underlying implementation of the command - `AircraftPanel` in this case +- "macro command" - setup a series of command objects in another command object. all these command objects will be invoked when invoking this macro command. this is a combination of [composite pattern](#composite-pattern) + command pattern + +
+code example +
+
+interface Command {
+
+  void execute();
+}
+
+class FireMissileCommand implements Command {
+
+  MissileLauncher missileLauncher;
+
+  FireMissileCommand(MissileLauncher missileLauncher) {
+    this.missileLauncher = missileLauncher;
+  }
+
+  @Override
+  public void execute() {
+    missileLauncher.fire();
+  }
+}
+
+class AircraftPanel {
+
+  Command[] commands = new Command[10];
+
+  void setCommand(int i, Command command) {
+    commands[i] = command;
+  }
+
+  void fire() {
+    // e.g. the fire button on the panel is wired to slot 3
+    commands[3].execute();
+  }
+}
+
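+- a rough sketch of the "macro command" mentioned above - it is itself a command that simply composes other commands -
+
+```java
+class MacroCommand implements Command {
+
+  List<Command> commands = new ArrayList<>();
+
+  void add(Command command) {
+    commands.add(command);
+  }
+
+  @Override
+  public void execute() {
+    // invoking the macro command invokes every composed command in order
+    commands.forEach(Command::execute);
+  }
+}
+```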
+
+
+
+### Iterator Pattern
+
+- traverse the elements of an aggregate without exposing the internal implementation
+- so, we have "iterator" and "concrete iterator", "aggregate" and "concrete aggregate"
+- "external iterator" - the client requests the next element and performs the operation
+- "internal iterator" - the client hands over the operation to perform to the iterator
+- this way, the iterator is never exposed to the client
+- e.g. [composite pattern](#composite-pattern) typically uses internal iterators
+- below, the air force stores jets and helis in different underlying collections, but everything is hidden behind one iterator
+
+code example +
+
+public interface Iterator {
+
+  IAircraft next();
+
+  boolean hasNext();
+}
+
+public class AirForceIterator implements Iterator {
+
+  List<IAircraft> jets;
+  IAircraft[] helis;
+  
+  int jetsPosition = 0;
+  int helisPosition = 0;
+
+  public AirForceIterator(AirForce airForce) {
+    jets = airForce.getJets();
+    helis = airForce.getHelis();
+  }
+
+  @Override
+  public IAircraft next() {
+
+    if (helisPosition < helis.length) {
+      return helis[helisPosition++];
+    }
+
+    if (jetsPosition < jets.size()) {
+      return jets.get(jetsPosition++);
+    }
+
+    throw new RuntimeException("No more elements");
+  }
+
+  @Override
+  public boolean hasNext() {
+
+    return helis.length > helisPosition ||
+      jets.size() > jetsPosition;
+  }
+}
+
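+- a minimal client sketch - the client only sees the `Iterator` abstraction, never the lists / arrays inside `AirForce` (the `AirForce` constructor is assumed) -
+
+```java
+AirForce airForce = new AirForce();
+Iterator iterator = new AirForceIterator(airForce);
+
+while (iterator.hasNext()) {
+  IAircraft aircraft = iterator.next();
+  // operate on the aircraft without knowing how it is stored
+}
+```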
+
+
+
+### Mediator Pattern
+
+- encourage loose coupling between interacting objects
+- by encapsulating interactions in a "mediator" object
+- the interacting objects are called "colleagues" and "concrete colleagues"
+- use when interactions between the colleagues become very complex
+- the colleagues are involved in many to many interactions, but with the mediator, it becomes one to many from the mediator to the colleagues
+- we can often combine the mediator pattern with the [observer pattern](#observer-pattern) as well
+- e.g. a runway needs to be free for an aircraft to land
+- instead of all aircraft checking with each other whether the runway is in use, we can use a control tower that manages all of this for us
+
+code example +
+
+class Aircraft {
+
+  ControlTower controlTower;
+
+  Aircraft(ControlTower controlTower) {
+    this.controlTower = controlTower;
+  }
+
+  void startLanding() {
+    controlTower.queueForLanding(this);
+  }
+
+  void land() {
+    System.out.println("pull out wheels");
+  }
+}
+
+class ControlTower {
+
+  Queue<Aircraft> aircraftQueue = new LinkedList<>();
+
+  void queueForLanding(Aircraft aircraft) {
+    aircraftQueue.add(aircraft);
+  }
+
+  // assume some scheduling mechanism invokes this periodically
+  @Schedule("2 minutes")
+  void allowLanding() {
+    if (!aircraftQueue.isEmpty()) {
+      aircraftQueue.poll().land();
+    }
+  }
+}
+
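+- a minimal wiring sketch - the aircraft only know about the mediator, never about each other (assuming the `Aircraft` constructor sketched above) -
+
+```java
+ControlTower controlTower = new ControlTower();
+
+Aircraft flightA = new Aircraft(controlTower);
+Aircraft flightB = new Aircraft(controlTower);
+
+flightA.startLanding();
+flightB.startLanding(); // both get queued - the tower decides when each one lands
+```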
+
+
+
+### Memento Pattern
+
+- capture the internal state of an object without exposing its internal structure
+- so that the object can be restored to this state later
+- "originator" - the object whose state is captured
+- "memento" - the snapshot / the state which was captured
+- "caretaker" - the object that holds the memento
+- since `getState` is private, outside classes such as the caretaker cannot call `getState` - only the originator can
+
+code example +
+
+class State { }
+
+class Originator {
+
+  static class Memento {
+
+    private State state;
+
+    Memento(State state) {
+      this.state = state;
+    }
+
+    private State getState() {
+      return state;
+    }
+  }
+  
+  private State state;
+
+  public Memento save() {
+    return new Memento(state);
+  }
+
+  public void restore(Memento memento) {
+    this.state = memento.getState();
+  }
+}
+
+class Caretaker {
+
+  private Stack<Originator.Memento> history = new Stack<>();
+  private Originator originator;
+
+  Caretaker(Originator originator) {
+    this.originator = originator;
+  }
+
+  void takeSnapshot() {
+    Originator.Memento memento = originator.save();
+    history.push(memento);
+  }
+
+  void undo() {
+    Originator.Memento memento = history.pop();
+    originator.restore(memento);
+  }
+}
+
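+- a minimal usage sketch - snapshot the originator before a risky change, undo if needed (assuming the `Caretaker` constructor sketched above) -
+
+```java
+Originator originator = new Originator();
+Caretaker caretaker = new Caretaker(originator);
+
+caretaker.takeSnapshot();
+// ... the originator's state gets mutated here ...
+caretaker.undo(); // the originator is restored to the snapshot
+```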
+
+
+
+### State Pattern - TODO
+
+- alter behavior of the object as its state changes
+- so that it appears to change its class
+- TODO: remaining
+
+### Template Method Pattern
+
+- subclasses define parts of the algorithm without modifying the overall structure of the algorithm
+- "template method" - the common part stays in the base class
+- "hook method" - the variable part is overridden by the subclasses
+- the base class can provide default implementations for these hook methods if needed
+- the template method can be made final
+- e.g. pre flight checks can be the template method, which checks -
+  - fuel levels
+  - air pressure
+  - if the door is locked
+- all these can be hooks i.e. specific to the aircraft
+- helps avoid "dependency rot" - where dependencies at various levels depend on each other horizontally and vertically
+- [factory method pattern](#factory-method-pattern) is a special form of the template method pattern
+
+### Strategy Pattern
+
+- make algorithms belonging to the same family easily interchangeable
+- "strategy" - the common interface
+- "concrete strategy" - the actual implementation of the different algorithms
+- "context" - uses the strategy
+- the context is composed of the strategy
+- the context can use a default strategy as well to lessen the burden on the client
+
+code example +
+
+interface ISort {
+
+  void sort(int[] input);
+}
+
+class BubbleSort implements ISort {
+
+  @Override
+  public void sort(int[] input) {
+  }
+}
+
+class MergeSort implements ISort {
+
+  @Override
+  public void sort(int[] input) {
+  }
+}
+
+class Context {
+
+  private ISort howDoISort;
+
+  public Context(ISort howDoISort) {
+    this.howDoISort = howDoISort;
+  }
+
+  void sort(int[] numbers) {
+    howDoISort.sort(numbers);
+  }
+}
+
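+- a minimal usage sketch - the client picks the algorithm, the context just uses whatever it was given -
+
+```java
+int[] numbers = { 5, 1, 4, 2 };
+
+new Context(new BubbleSort()).sort(numbers);
+new Context(new MergeSort()).sort(numbers); // swap the algorithm without touching Context
+```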
+
+
+
+### Visitor Pattern - TODO
+
+- define new operations for the elements of an object structure without changing the classes of these elements
+- e.g. assume we want to monitor several metrics like fuel, altitude, etc on all the aircraft
+- option - introduce all these methods on each of the concrete aircraft classes
+- issue - we are bloating our aircraft classes
+- solution - we use the visitor pattern
+- note how the visitor will have a separate method for each concrete class
+- so, we have "element" and "concrete element", "visitor" and "concrete visitor"
+- the concrete element will call its corresponding method on the visitor
+- if concrete elements increase, we will have to modify all visitors
+- so, use the visitor pattern when the element hierarchy is stable but we keep adding new functionality to visitors
+
+code example +
+
+interface Aircraft {
+
+  void accept(AircraftVisitor visitor);
+}
+
+class Boeing implements Aircraft {
+
+  @Override
+  public void accept(AircraftVisitor visitor) {
+    // double dispatch - the concrete element passes itself to the visitor
+    visitor.visitBoeing(this);
+  }
+}
+
+class F16 implements Aircraft {
+
+  @Override
+  public void accept(AircraftVisitor visitor) {
+    visitor.visitF16(this);
+  }
+}
+
+interface AircraftVisitor {
+
+  void visitBoeing(Boeing boeing);
+
+  void visitF16(F16 f16);
+}
+
+class FuelVisitor implements AircraftVisitor {
+
+  @Override
+  public void visitBoeing(Boeing boeing) {}
+
+  @Override
+  public void visitF16(F16 f16) {}
+}
+
+class DoorVisitor implements AircraftVisitor {
+
+  @Override
+  public void visitBoeing(Boeing boeing) {}
+
+  @Override
+  public void visitF16(F16 f16) {}
+}
+
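+- a minimal usage sketch - one visitor per concern, applied to every aircraft; adding a new concern is just another visitor, the aircraft classes stay untouched -
+
+```java
+List<Aircraft> aircrafts = List.of(new Boeing(), new F16());
+
+AircraftVisitor fuelVisitor = new FuelVisitor();
+aircrafts.forEach(aircraft -> aircraft.accept(fuelVisitor));
+```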
+
+
diff --git a/_posts/2024-03-09-high-level-design.md b/_posts/2024-03-09-high-level-design.md new file mode 100644 index 0000000..d11b63a --- /dev/null +++ b/_posts/2024-03-09-high-level-design.md @@ -0,0 +1,1622 @@ +--- +title: High Level Design +--- + +## Software Architecture + +- what is software architecture - + - high level design - hide implementations and express in terms of abstractions + - of the different components + - and how they interact with each other + - to fulfil requirements (what it should do) and constraints (what it should not do) +- software development lifecycle - we can repeat this process again and again + - design + - implementation + - testing + - deployment +- software architecture is the output of the first step / input to the second step +- decisions at the bigger level cannot be changed easily, cost a lot of wasted effort, etc so we need to make good decisions + +## System Requirements + +- the scope of the problem / the number of ways to solve a problem increases as the abstraction increases from designing a method -> class -> module -> application +- the ambiguous problem needs to be converted to a technical problem +- we might need to ask clarifying questions to the client +- different types of requirements - + - features of the system + - quality attributes + - system constraints + +### Features of the System + +- express the actual "functional requirements" of the system +- e.g. hitchhiking service - allow users to share rides +- identify all the "actors" and "use cases" +- expand each "use case" through a "flow of events" - we can use a [sequence diagram](/posts/low-level-design/#sequence-diagrams) for this + +![features of the system](/assets/img/high-level-design/features-of-the-system.svg) + +### Quality Attributes + +- to address the "non functional requirements" of the system +- how well a system should perform in a [particular dimension](#important-quality-attributes) +- [important quality attributes](#important-quality-attributes) include [performance](#performance), [scalability](#scalability), [availability](#availability), [fault tolerance](#fault-tolerance), etc +- have a direct impact on the technical decisions of the system unlike [features of the system](#features-of-the-system) +- e.g. show products when searched for under 100ms, system should be available 99.9% of the time, etc +- they have to "measurable" and "testable" +- need to make "tradeoffs" - there is no one architecture that can address all problems +- sometimes, clients might make "infeasible" requirements - 100% availability, unlimited storage, etc. we should call them out + +### System Constraints + +- limitations and boundaries of a system +- three types of constraints - technical, business and regulatory +- "technical constraints" - e.g. lockin to a particular database, cloud vendor, software license, etc +- "business constraints" - time and budget limitations +- "regulatory constraints" - e.g. 
location specific +- we should avoid tight coupling, else we would have constraints specific to hardware etc + +## Important Quality Attributes + +### Performance + +- "response time" - time between client sending a request and receiving a response +- response time = "processing time" + "waiting time" +- processing time - time spent in performing the actual business logic +- waiting time - time spent in transit, waiting queues, etc +- waiting time is also called "latency", while response time is also called "end to end latency" +- response time is critical when a request is in the path of a user interaction - users do not like to wait +- "throughput" - can be + - either "number of tasks performed per unit of time" + - or "amount of data processed per unit time" i.e. bits per second etc +- throughput can be useful when for e.g. analyzing a constant stream of logs from several sources +- consideration 1 (response time) - e.g. we as developers think our processing time is 10 ms so response time is 10ms, but assume our server can only process one request at a time +- if we get two concurrent requests, the waiting time for the second request will be 10ms, thus increasing its response time to 20ms +- so, response time is affected by waiting time as well +- consideration 2 (response time) - response times for some requests in our system will be very bad, while all others would be relatively better +- these relatively slow response times are called "tail latency" +- so, instead of metrics like median or average, the most effective way to measure response times is a "percentile distribution chart", instead of just using median or average +- in this chart, the "xth percentile" is the value below which x% of the values can be found +- refer the part around 100th percentile in the percentile distribution graph below for tail latency +- so, we would set [slo](#sla-slo-and-sli) like so - 95th percentile of requests should have 30ms response times + +![percentile distribution response time](/assets/img/high-level-design/percentile-distribution-response-time.svg) + +- consideration 3 (both response time and throughput) - effect of load - the point where the response time starts increasing / throughput starts decreasing due to increase in load is called the "degradation point" + +![degradation point](/assets/img/high-level-design/degradation-point.svg) + +### Scalability + +- the load on our system never stays the same - seasonal traffic e.g. 
during holidays +- "scalability" - systems capability to handle growing amount of load +- scalability are of three types +- "vertical scalability" - adding more resources / upgrading existing resources on a single machine +- advantage - no code changes are needed, migration is straightforward +- disadvantage - + - there is a limit to which we can scale vertically + - does not provide [high availability](#availability) or [fault tolerance](#fault-tolerance) +- "horizontal scalability" - adding more instances on different machines +- advantage - + - no limit to scalability + - more importantly - provides [high availability](#availability) or [fault tolerance](#fault-tolerance) +- disadvantage - + - code changes might be required + - overhead around coordination is introduced +- "team / organization scalability" - as we add more engineers, productivity decreases after a certain point +- we can split codebase into separate modules or better, architecture into separate services to decrease conflicts + +### Availability + +- "availability" - fraction of time our system is operational +- so, availability = uptime / (uptime + downtime) +- mtbf - "mean time between failures" and mttr - "mean time to recovery" (both are self explanatory) +- so, we can also say availability = mtbf / (mtbf + mttr) +- so, one way to ensure high availability is to reduce mttr i.e. detect and resolve issues in near 0 time +- 99.9% means ~9h of downtime in a year + +### Fault Tolerance + +- there can be "human errors" (e.g. faulty config), "software errors" (out of memory exceptions) or "hardware failures" (infrastructure issues / outage) +- failures are inevitable +- "fault tolerance" - helps keep system operational (i.e. [available](#availability)) despite failure of multiple components +- fault tolerance tactics - prevention, detection / isolation and recovery +- "failure prevention" - eliminate single points of failures. use "replication" and "redundancy" for this. two strategies - + - "active active architecture" - requests can go to any replica. so, all of them have to be kept in sync. so, if one of them goes down, the remaining one will still continue to operate. advantage - helps balance load, since it is like [horizontal scalability](#scalability). disadvantage - keeping all replicas in sync is non trivial + - "active passive architecture" - one primary replica takes all the requests, while the passive replicas take periodic snapshots of the active replica. disadvantage - we cannot [scale](#scalability) our system horizontally, since we are still restricted to the one active replica. 
advantage - this leader follower pattern is much easier to implement +- "failure detection / isolation" - if we have a faulty replica, our system should be able to detect it and isolate it +- this is done by a monitoring service using - + - health checks - monitor service polling the servers periodically + - heartbeats - the servers sending heartbeats to the monitoring service periodically +- monitoring service can be more complex - declare a host to be failed based on its error rate, if its response time has suddenly increased, etc +- "failure recovery" - some strategies - + - stop sending traffic to the faulty instance + - attempt to restart the host + - "rollback" - + - rollback service to a stable version + - rollback databases when it reaches an inconsistent state to a previous consistent state + +### SLA, SLO and SLI + +- sla - "service level agreement" - agreement between the service provider and client +- if we fail to deliver these sla, we have to provide refunds, license extensions, etc to clients +- slo - "service level objective" - goals we set for our systems +- each slo can represent one of the [quality attributes](#important-quality-attributes) +- an sla is basically a collection of slo +- even if we do not have an sla, we should have slo so that our users know what they can expect from us +- sli - "service level indicator" - quantitative measure of the different [quality attributes](#important-quality-attributes) +- achieved using monitoring services +- we can compare what we see in sli to what we define in slo +- this is why we said [quality attributes](#quality-attributes) should be measurable and testable - otherwise, we would not have been able to measure our slo using sli +- general guide - based on what clients ask, we should define slo and then find out matching sli +- another technique - define loser external slo but stricter internal slo + +## API Design + +- api - "application programming interface" +- the interface is a "contract" between our systems and the client +- our system becomes a "black box" - the client need not know the internal implementation of this api, they just have to interact with this api +- once we define the apis, our clients can start integrating with it without us actually having built its implementation entirely +- it is called remotely over the network +- apis can be public, private / internal and partner +- "public api" - exposed to general public and any developer can call them. might require registration from users first +- "private api" - used by other systems inside the organization, but not exposed outside the organization +- "partner api" - to organizations having a business relationship with us +- two types of apis we discuss - rpc and rest api + +### Good Practices for API Design + +- "encapsulation" - clients should not have to care about implementation +- we can change the implementation without the client changing anything on its end +- "ease of use" - descriptive actions and resources, keeping it consistent +- "idempotent operations" - no effect if the operation is performed > once. 
updating the address is idempotent, increasing balance by 100 is not +- assume there is an error due to some reason - the request is lost / response to the message is lost (but the request was processed) +- now, the client does not know which one happened +- so, even if it retries the operation, it should not have any consequences +- "pagination" for large responses - the client can provide the offset and the number of items to retrieve +- "asynchronous operations" - some operations are very big, and we cannot provide any reasonable response immediately +- instead of the client having to wait for something like this, we can use asynchronous operations +- the immediate response includes an identifier which the client can use to track the progress of the operation +- "versioning" - allows us to make non backward compatible changes to the api + +### RPC + +- rpc - "remote procedure calls" +- ability of a client to execute a subroutine on a remote server +- "location transparency" - calling an rpc looks like calling a local method +- applications written in different programming languages can also talk using rpc +- idl - "interface description language" - we define the api and data types in this language +- then, the rpc framework we use generates 2 separate implementations - "server stub" and "client stub" +- they include the corresponding classes for the api and data types we define in the interface description language +- rpc will also take care of marshalling / unmarshalling the request / response for us automatically +- it might include propagation of exception etc as well +- rpc helps the clients focus on performing an action on the server systems, and not worry about the network communication +- drawbacks - + - remote methods are a lot slower and unreliable. the client execution will thus be blocked. so, we should try writing asynchronous versions for the remote methods + - it is also not useful when we want the features like cookies, headers etc +- popular frameworks - + - grpc by google - high performance rpc. uses "http/2" for transport and "protocol buffers" as the interface description language + - apache thrift - by facebook + - java rmi (remote method invocation) - unlike above two, specific to java - helps one jvm invoke a method on another jvm + +### Rest API + +- rest - "representational state transfer" +- it is not a standard or protocol, but an architectural style +- advantage - helps maintain [quality attributes](#important-quality-attributes) like [performance](#performance), [availability](#availability), [scalability](#scalability) and [fault tolerance](#fault-tolerance) +- an api that obeys the rest architectural style is called a "restful api" +- the only actions a client can take in an rpc api is defined inside the interface definition language - so, it is somewhat static +- in rest, we can use hateoas - "hypermedia as the engine of application state" - the response contains "hypermedia links" around the operations that the client can perform +- rest should be "stateless" - no session information should be maintained by the server +- this way, each request is served in isolation +- advantage of statelessness - multiple requests by a single client can be processed by different horizontally scaled instances +- "cacheable" - the server can declare a response to be as cacheable or non cacheable. 
if a response is cacheable, the extra round trip to the server is avoided - the response is returned from the cache directly and our server is never even called +- this reduces response time and the load on server is reduced +- "resources" - resources are organized in a hierarchy in using the uri "uniform resource locator" +- a resource can be a "simple resource" or a "collection resource" +- resources can also have "sub resources" +- use nouns only for resources +- "resource identifiers" - should be unique +- for modelling actions, we use the http verbs +- so, unlike rpc, the only actions supported are crud - creating (POST), reading (GET), updating (PUT) and deleting (DELETE) a resource +- GET method is considered "safe" - does not change the state of the system +- GET, PUT and DELETE methods are considered "idempotent" - applying them multiple times will result in the same state change as applying them once +- GET requests are also considered cacheable by default +- the client can send the additional data using json (or xml) +- creating a rest api for a movie streaming service +- identify the resources - movies, users, reviews, actors +- map to uris - + - /users, /users/{user_id} + - /movies, /movies/{movie_id} + - /actors, /actors/{actor_id} + - /movies/{movie_id}/reviews, /movies/{movie_id}/reviews/{review_id} + +## Large Scale Systems Building Blocks + +### Load Balancers + +- if we run our application on multiple instances due to [horizontal scaling](#scalability), the client applications will have to know the ip addresses of all these instances in advance +- this results in tight coupling of clients to our systems, and makes it hard for us to make any changes +- advantages of load balancers - + - acts as a layer of abstraction between clients and our instances, so it looks like one server to the client + - distributes the load from clients among our horizontally scaled instances equally + - "autoscaling policies" - easily add / remove instances to the fleet based on requests per second, network bandwidth, etc, and all of this is hidden behind a single load balancer + - "fault tolerance" - load balancers can be configured with "health checks" to avoid sending traffic to unhealthy instances + - [rolling release](#rolling-deployment-pattern) - we can perform maintenance tasks easily by pulling down hosts one by one, and the load balancer would not direct traffic to these hosts +- types of load balancers - dns, hardware, software and global server +- "dns load balancer" - dns maps human friendly urls to ip addresses +- "dns record" is the response by "dns servers" when asked for ip addresses for a url +- can return multiple ip addresses in this record, ordered differently every time (maybe using round robin) +- the clients typically pick the first address from this list, and we achieve load balancing this way +- disadvantages + - dns servers do not perform health checks, so can return ips of faulty servers + - the dns record can be cached at client, which means they can call the faulty instance till the ttl - "time to live" expires + - exposes the ip addresses of our instances directly, thus exposing implementation details +- "hardware load balancers" and "software load balancers" address all the above problems with dns load balancers +- hardware load balancers run on hardware optimized for load balancers +- software load balancers can run on any general purpose machine +- all the communication is done through the load balancer, thus making our systems much more secure - in dns load 
balancing, it was happening directly once the client got the ip addresses +- they can monitor the health of our instances and only route traffic to the healthy instances +- they also allow for more advanced setup like take the instance type into account - some instances in our fleet might be more powerful than others, use more powerful techniques like current requests per second when load balancing the traffic, etc +- disadvantages + - typically, hardware and software load balancers are located close to the instances. so, if we run our load on multiple geographical locations called data centers, one group of the instances will have the load balancer located far away + - also, load balancers do not solve the "dns resolution" problem on their own - load balancers are again just an ip address, and we need to map it to a more human friendly url + +![hw and sw lb disadvantage](/assets/img/high-level-design/hw-and-sw-lb-disadvantage.svg) + +- "global server load balancer" - more intelligent than the typical dns load balancer +- it can redirect clients to the data center geographically closer to them, the location that will send a faster response time (this can be different from just using the geographical location due to number of hops), etc +- there is a load balancer deployed at each of the data center +- also, gslb can handle outages in one data center by not routing traffic to this faulty data center + +![gslb](/assets/img/high-level-design/gslb.svg) + +- open source software load balancers - haproxy, nginx +- cloud load balancers - aws elb, which has various types as well +- global server load balancer - route53 +- load balancers are also called "dispatchers" +- if using [microservices](#microservices-architecture), we can have a dispatcher for each micro service, and each microservice can be individually scaled +- below, we use load balancers both for communication from outside and internal clients + +![load balancing microservices](/assets/img/high-level-design/load-balancing-microservices.png) + +### Message Brokers + +- also called mom - "message oriented middleware" +- "synchronous communication" - both sides - client and server need to be healthy and maintain an active connection either with each other or via the load balancer - this is good when the server takes a short time to process and respond +- "message broker" - a queue data structure to store messages between senders and receivers +- message brokers helps with "asynchronous architecture" +- it entirely decouples senders from receivers - the sender does not wait for a confirmation from the receiver - it just puts the message onto the broker. this adds a lot of [fault tolerance](#fault-tolerance) - receivers can be down and still receive the events when they come back up. they also prevent messages from being lost. in synchronous systems, it can happen that the request / response is lost, and the client will never know which one it was, and it might retry, which would lead into further issues if the request is not idempotent +- e.g. the users see a success screen immediately after placing an order, while they get an email later if the order is placed successfully. 
this placing of an order involves a chain of services like order service, payment service, notification service, etc, and the client gets an immediate response with all this being handled behind the scenes +- message brokers are generally not exposed to outside world unlike load balancers +- it acts like a buffer when there is an increase in the load - assume we use synchronous communication - if there is a sudden spike, we will we will receive a lot of requests concurrently, which can result in our system crashing, dropping requests, etc. this is solved using asynchronous communication +- it can help with [load balancing](#load-balancing-pattern) - multiple instances can listen for an event and the message broker will send it to one of them +- it can also perform transformations on these messages, thus helping with [streaming analytics](#big-data) +- open source message brokers - rabbitmq, kafka +- cloud message brokers - sqs + +### API Gateway + +- we break our services into smaller services due to the [organization scalability](#scalability) +- the client will also need to now know about the different services - one service for fetching videos, another for fetching comments and so on +- api gateway helps with "api composition" - we compose all the different apis in all our services into one single api that the clients can interact with +- now, each service will need its own authentication and authorization. api gateway helps eliminate the duplication of auth logic - api gateway supports not only authentication but authorization as well +- we can have different apis for mobile vs desktop applications, and the client would be abstracted away from all this - [backends for frontends pattern](#backends-for-frontends-pattern) using user agent header +- api gateways can perform "ssl termination" - the traffic is encrypted between clients and api gateway, but decrypted between api gateway and servers +- api gateway can also implement "rate limiting" to prevent dos "denial of service" attacks +- without an api gateway, the client will make a call to fetch the home page, another call to fetch the video and finally another call for all the comments of this video. using "request routing", api gateway makes all the calls itself and sends the aggregated response to the client. 
this helps improve the performance a lot, since we are saved from these multiple requests going over the internet +- "static content and response caching" - caching to reduce response time for client +- it supports monitoring as well to not route traffic to unhealthy instances +- it can perform "protocol translation" - the api gateway exposes a rest api, while the underlying services use soap and xml, grpc + protocol buffers, etc +- considerations - api gateway can become a single point of failure - deploy multiple api gateways sitting behind a global server load balancer +- do not put business logic into api gateways +- open source api gateway - netflix zuul +- cloud api gateway - amazon api gateway + +### Note - Load Balancers vs API Gateway + +- [load balancers](#load-balancers) are only for balancing load among identical "servers" +- api gateway is the "public facing interface" that routes traffic to "services" and not "servers" +- so, a common pattern is that an api gateway routes traffic to load balancers, which can then route traffic to the individual servers +- apart from that - feature sets of both are different - + - load balancer is more around the [different routing algorithms](#load-balancing-pattern), performing health checks, etc + - api gateway is more around api composition, auth, request routing, protocol translation, throttling, caching, ssl termination, etc +- so, a load balancer might be enough for internal, individual services, while we might need an api gateway for public facing services + +![load balancer vs api gateway](/assets/img/high-level-design/load-balancer-vs-api-gateway.png) + +### CDN + +- cdn - "content delivery network" +- even with hosting on multiple data centers, there is significant latency between end user and server location +- first the 3 way handshake happens, then maybe the html is served and eventually all static assets like images are served +- this involves multiple network round trips and hops from the client to the server +- users do not wait for long for websites to load - they typically abandon it +- we can get the static content like htm, css, js, images and videos closer to our end users +- cdn is a "globally distributed network of servers" +- the servers are called "edge servers" +- the location the cdn servers are present at are called pop - "points of presence" +- page loads are faster now +- cdn also protects us against ddos attacks +- cdn also uses technologies that are more optimized, like using storage optimized for delivering static content, compressing using algorithms like gzip, minification of files, etc +- there are two strategies we can use - pull and push +- "pull strategy" - we tell cdn which content it should cache, and how often this should be "invalidated", which is configured by using a "ttl" property +- the first time, the cdn has to make the request to our servers to cache it +- however, subsequent requests are served by the edge servers of the cdn directly +- after the expiry, the cdn will send our servers a new request to check if the asset has changed, and accordingly refresh the cached asset +- disadvantages + - servers need to be available (first time or when ttl is reached) in order to serve the response + - first request after ttl is reached is slow +- "push strategy" - we publish the content to the cdn directly when the new version of the asset is available +- so, we typically do not set a ttl / set a very long ttl in this +- advantage - using the push strategy, the dependency on our servers to stay 
available is removed +- disadvantage - not desirable for frequently changing content, since it would require frequent invalidations and pushes from our end +- examples - cloudflare, amazon cloudfront + +## Data Storage + +### Relational Databases + +- refer [relational databases](/posts/relational-databases/) - tables, rows, columns, primary and foreign keys, etc +- advantages - + - perform flexible and complex queries using for e.g. joins + - remove data duplication by storing data efficiently + - intuitive for humans + - provides guarantees around [acid transactions](/posts/spring/#jpa) +- disadvantages - + - rigid structure enforced by schema, which requires planning ahead of time + - hard to maintain and scale due to guarantees around acid transactions - it can only be scaled vertically, not horizontally + - slower reads + +### Non Relational Databases + +- nosql databases - non relational databases +- solve drawbacks of [relational databases](#relational-databases) +- advantages - + - remove rigidity around schema - different records can have different sets of attributes + - eliminate the need for an orm - store data in a more "programming language friendly" and not "human friendly" way, by supporting structures like lists, maps, etc + - support much faster queries + - scale much more than relational databases, which is useful for big data like use cases - it can be scaled horizontally as well + - it follows base - + - basically available - never rejects the reads or writes + - safe state - can change data without user interaction - e.g. when performing reconciliation when there is deviation between replicas + - eventually consistent - we might get stale data +- disadvantages - + - does not support complex querying - operations like joins become hard + - acid transactions are not supported +- several types of non relational databases +- key value store - the value can be anything and it is opaque to the database - we cannot typically query on the value, only on the key. one use case - counters touched by multiple services. e.g. redis, amazon dynamodb +- document store - collections of documents, where documents have relatively more structure compared to a key value store - we can query on value. values are like an object. e.g. cassandra, mongodb +- graph database - an extension of a document store. helps establish relationship between records easily. use case - recommendation engine, social networks, etc. e.g. 
neo4j, amazon neptune +- we can also use nosql databases as a layer of cache in front of sql databases + +### Choosing the Right Database + +- redis - + - use cases - cache database calls, cache external service calls, etc + - these are key value stores +- s3 - + - used for assets like videos, images, etc + - typically backed by cdn solutions as well +- elasticsearch - + - built on top of apache lucene + - search using different fields of the entities + - supports fuzzy searching to help with typos - we can also configure the edit distance based on use case + - they are not meant to serve as primary sources of data - they should only serve searches +- influxdb - + - it is a time series database + - used for tracking application metrics like cpu utilization, throughput, etc + - it typically supports / is optimized for append only operations - it should not be used for frequently changing data + - read queries are performed in bulk - we query for the last few minutes or hours of data and perform aggregations on them +- cassandra - + - can handle massive amounts of reads and writes + - follows a no master / leaderless strategy + - the entire design of key value store comes in here + - so, horizontally scaling is as simple as adding more nodes + - these key value stores can make queries based on partition key easily - however, they cannot perform any complex searching + - this is a columnar db + - used for ever increasing data + - types of queries supported are mostly partition key based +- hadoop - + - used for data warehousing to perform analytics + - we can dump all of the data in a large database and support querying on this data + - used for offline reporting +- mysql - + - if we have structured information and we need acid transactions + - we want strong consistency + - use cases - inventory management, payment related, etc +- mongodb - + - this is a document db + - lot of attributes, non rigid schema + - variety of queries - optimized for json like structures + +### Improve Quality Attributes of Databases + +- three techniques - indexing, replication, partitioning +- "indexing" - speed up retrievals by locating them in sub linear time +- without indexing, retrievals would require a full table scan +- this is a [performance](#performance) bottleneck +- underneath, it uses data structures like + - hash maps - e.g. find all people from a particular city. city can be the key, while the value can be a list of row indices containing that city + - balanced b trees - e.g. find all people in a particular age range +- composite indexes - formed using a set of columns +- while the advantage is that reads speed up, disadvantages are + - more storage space is required + - writes become slower +- "replication" - already discussed in [fault tolerance](#fault-tolerance) for compute, same logic +- disadvantage - not trivial to maintain, more common in non relational databases than in relational databases +- "partitioning / sharding" - in replication, we copy the same data in all replicas. in partitioning / sharding, we split the data in different replicas +- now, we are not limited by the storage capability of one machine +- additionally with more storage, queries can now be performed in parallel on the different partitions, thus increasing the speed +- disadvantage + - route the query to the right partition + - avoid hot partitions, etc + - more common in non relational databases than in relational databases +- partitioning can be done for compute as well - e.g. 
traffic from paid customers go to more powerful machines unlike traffic from free customers + +### Brewer's CAP Theorem + +- in case of a network partition, a distributed database has to chose one of consistency and availability +- e.g. below, a user updates the value to 6 in a replica +- another user queries another replica. the replica then via intercommunication realized that the value has changed, and sends the updated 6 value to the user + +![cap theorem introduction](/assets/img/high-level-design/cap-theorem-introduction.svg) + +- "network partition" - e.g. due to some network issues, one replica is isolated from others +- now, the replica that is isolated has two options - + - favoring availability - return its local value, which may be outdated + - favoring consistency - return an error, asking to try again later +- note - this only happened when there is a network partition, otherwise, all three were guaranteed +- definitions below in cap theorem are a little bit different then what we saw for e.g. [here](#important-quality-attributes) +- "consistency" - read request receives either the most recent write or an error. this helps guarantee all the clients see the same value at the same time, regardless of the database instance they communicate with +- "availability" - every request receives a non error response, which can be outdated +- "partition tolerance" - system continues to operate despite an arbitrary amount of messages being lost over the network +- so, cap theorem states that we can only have two of the three things +- so, we already saw cp and ap, what about ca? +- we can have ca if we have no replicas - only a centralized database + +### Unstructured Data + +- unstructured data - does not follow any "structure" +- e.g. audio / video files etc - they are just a blob "binary large object" +- while both [relational](#relational-databases) and [non relational](#non-relational-databases) databases allow for storing of blobs, they are meant for structured, and not unstructured data. e.g. they impose size limits etc +- some use cases of unstructured data - + - users upload files like videos and images, which we need to process (e.g. transcode, compress, etc) + - relational / non relational database snapshots - these snapshots are unstructured data + - web hosting - static content + - huge datasets used for machine learning, e.g. readings from sensors +- two solutions for unstructured data - dfs and object storage +- dfs - "distributed file system" +- features / advantages - + - internally, can have features like replication, auto healing, etc + - looks like a familiar tree like structure (files within folders) to us + - works like file system - mounting on hosts etc + - we can modify files like we typically do when working locally, e.g. append logs to log files +- disadvantage + - cannot work with web for static content directly - will require a wrapper on top + - has limits on the number of files we can store i.e. 
the storage space has limits +- "object / blob storage" - scalable storage, but unlike dfs has no limits on how many objects can be stored +- stored in containers called buckets +- also, object storage allows "bigger" files compared to dfs - which makes them ideal for storing database snapshots +- they expose a rest / http api unlike dfs, which can be easily referenced by our static html pages +- they support "object versioning" - for a file system, another wrapper would be needed +- files are stored in a "flat structure" - not a tree like structure like in file systems +- the object has a name and a value associated with it, which is the actual content +- typically, object storage is broken into several classes, which offer different throughput and latency +- object storage uses replication too +- disadvantage - + - files cannot be opened and modified like we can when using dfs - we need to for e.g. create and upload an entirely new version + - cannot be mounted like file systems +- we can also run object storage services on our own storage, if cloud is not an option +- e.g. openio is such a solution +- s3 (simple storage service) is aws's object storage + +## Big Data + +- datasets are either very large in size or come at a very high rate for our system to be able to process +- the output of big data processing can be visualizations, data that can be queried, predictive analysis, etc +- "batch processing" - + - we store the data on distributed file system + - we then run jobs on it based on a schedule + - every time the job runs, it can either pick up the new data that was added to the system since the last time it ran, or it can process the entire dataset from scratch + - after processing, it can write the computed view to a database +- advantages of batch processing + - easy to implement + - more efficient than processing each event individually + - e.g. we push some faulty code. if our dfs still has all the original data, we can push the fixed code and run the job on the entire dataset again + - finally, we have visibility into historic data as well +- drawbacks - not realtime +- e.g. we would like logs and metrics to be analyzed realtime so that we can identify and debug production issues quicker +- so, we use "stream processing" + - the events come on a [message broker](#message-brokers) + - so it reacts realtime, not based on a schedule + - after processing, it can write the computed view to a database +- advantage - react immediately +- disadvantage - complex analysis cannot be done - fusing data from different times is very difficult / not possible - our computations can only use recent data +- going back to the same e.g. of observability systems, we would need historic data as well in anomaly detection +- so, we can use the "lambda architecture" - balance between batch processing, and stream processing +- it has three layers +- "batch layer" - follows the batch processing architecture. it takes all the data into account and typically overwrites its old output +- "speed layer" - follows the stream processing architecture. 
it helps fill the gap caused by events which came in since the last event that was operated on by the batch job +- "serving layer" - joins the outputs of the batch layer and speed layer and combines them into one + +![lambda architecture](/assets/img/high-level-design/lambda-architecture.svg) + +## Cloud Computing + +- cloud is mostly based on iaas - "infrastructure as a service" +- gives us access to virtually infinite compute, storage and networking +- we only pay for what we use / what we reserve, thus saving costs +- we can improve our scalability and reliability by deploying our software to "multiple regions" and "multiple zones" +- disadvantage of cloud computing - we do not have access to the infrastructure + +## Scalability Patterns + +### Load Balancing Pattern + +- synchronous communication - can be implemented using [load balancers](#load-balancers) +- asynchronous communication - can also be implemented via [message brokers](#message-brokers) +- note - load balancing != load balancer, so do not get confused +- note - message brokers are not exposed outside, so they cannot be used via client directly unlike [load balancers](#load-balancers) +- when using cloud, both load balancers and message brokers are built with redundancy and replication in mind to increase [fault tolerance](#fault-tolerance) +- there are various "routing algorithms" used for load balancing. we discuss three of them below - round robbin, sticky session and least connections +- "round robbin" + - the simplest / most common / default algorithm + - routes each request sequentially to the "next" worker instance + - disadvantage - only works when application is stateless - each request by a client can be handled in isolation by any one of the target servers. it will not work when an "active session" is maintained between a client and a server +- "sticky session / session affinity" - + - use cases - + - auth information of a client is stored in the session so that the client does not have to reauthenticate repeatedly + - client is uploading a very large file in parts. the different parts need to go to the same server for this to work + - requests from the same client are always sent to the same server + - this can be achieved using a cookie / by inspecting client's ip address + - disadvantage - this only works for smaller sessions - otherwise, the same server might end up with too many longstanding connections +- "least connections" - + - route the request to the server with least number of open connections + - so, it solves the problem we saw with sticky sessions + - use case - like sql, ldap, etc +- "auto scaling + load balancing" - most instances run a background process called "agent". it collects metrics around cpu consumption, network traffic, memory consumption, etc. based on these metrics, we can automatically "scale in" (decrease) / "scale out" (increase) the number of our instances. 
we can tie this to [load balancer](#load-balancers) as well, thus the load balancer would always be aware of the available ip addresses + +### Pipes and Filters Pattern + +- data flows from the "source" to "sinks" +- it encounters multiple "filters" along the way, which does only one thing, and is unaware of one another +- source examples - service that receives requests from users, readings from sensors, etc +- sink examples - databases, distributed file systems +- the pipes in between are typically message brokers + +![pipes and filters](/assets/img/high-level-design/pipes-and-filters.png) + +- if we put all the processing logic in one application, it will end up being a monolith +- we saw the disadvantages of a monolith [here](#multi-tier-architecture) +- by using different filters + - the throughput will increase, as well as different filters can perform different tasks + - each filter can be individually horizontally scaled + - we can use different technology for each filter based on the use case +- till now, we saw a "sequence of filters" that run on some data +- we can also have multiple such sequence of filters all running in "parallel" +- an example of all filters needed for a video streaming platform - + - split into chunks, so that the video can be downloaded in chunks instead of downloading it all at once + - select a frame from each chunk to act as thumbnails, which helps when we try to seek + - resize each chunk to different resolutions, which helps with "adaptive streaming" i.e. decide the quality of the video based on the client's bandwidth + - in parallel to all the filters above, another sequence of filters can convert audio into captions based on nlp etc +- filters should be "stateless" +- this pattern is not ideal if we want to run all the filters as a part of a transaction - performing a distributed transaction is very difficult + +### Scatter and Gather Pattern + +- the client sends a request to the "dispatcher" +- the dispatcher sends the request to the "workers" and gathers the result +- unlike [load balancing](#load-balancing-pattern) where the request is only forwarded to one instance, the request in this case is send to all workers +- each worker is independent of the other, and thus they can all operate in parallel +- throughout this pattern, the client is unaware of all this + +![scatter gather](/assets/img/high-level-design/scatter-gather.png) + +- the workers can be + - completely different services - for add recommendations, we request multiple services and then chose the best add for the user and show it to them + - same service with access to different data - e.g. one worker processes files 1 to 100, a second worker processes files 101 to 200 and so on. i think this is what is used in databases with sharding +- if one of the workers do not respond, we can aggregate the partial results from the other remaining workers +- we can also use a message broker in between the dispatcher and workers for decoupling. 
if it is not possible to return the result instantaneously, the dispatcher can instead send an id which the client can monitor + +### Execution Orchestrator Pattern + +- imagine we break a monolith into [microservices](#microservices-architecture) +- an extra "orchestration service" is used, which does not perform any business logic itself +- it performs complex flows by calling different services in the right order +- this is in a way like [scatter and gather](#scatter-and-gather-pattern), but here we have a sequence of operations - not one operation sent down to all the workers. again, unlike in scatter and gather where all operations could be performed in parallel, we may or may not be able do that here, as result from one service might be used as a request to another +- the orchestration service maintains all the intermediate state till it is able to construct and return the final result + - what if the orchestration service fails midway / or after performing the entire flow but just before sending the response? + - the orchestration service can store all its intermediate state inside a db, so that if the client re initiates the request, another orchestration service can pick up from where the faulty orchestration service left +- the orchestration service also has logic around handling exceptions and retries - e.g. [saga pattern](#saga-pattern) +- for high availability, we can also deploy the orchestration service in a horizontally scaled manner and have it sit behind a load balancer +- orchestration service != [api gateway](#api-gateway) - api gateways are meant to be dumb, while the orchestration service fully understands the context of a request +- best practice - the orchestration service is not meant for business logic - only for orchestration. the business logic is performed only by the various services sitting behind it + +![execution orchestrator pattern](/assets/img/high-level-design/execution-orchestrator-pattern.png) + +### Choreography Pattern + +- drawback of [execution orchestrator pattern](#execution-orchestrator-pattern) - changes in any of the services involves a change in the orchestration service +- this is called a "distributed monolith" - the orchestration service in the above example has become a distributed monolith because for e.g. 
multiple teams working on their own services might have to now change the orchestration service code together, again impacting [organization scalability](#scalability) +- instead, the orchestration service is replaced by a message broker +- a message is put onto the message broker, and the services can subscribe to this message as needed +- they can then also put more messages into the queue as a result of which other services can subscribe to them again +- this continues till the flow is complete +- since all this communication is asynchronous, all services are decoupled from each other +- even if one of the services is down, the flow can still continue and the relevant parts will still complete +- disadvantage - tracing the entire flow can become very difficult in case of issues, since we do not have a central orchestrator which was aware of all the steps during the entire flow + +![choreography pattern](/assets/img/high-level-design/choreography-pattern.png) + +## Patterns for Data Intensive Applications + +### Map Reduce Pattern + +- simplified processing pattern +- by google around 2004 +- we need to distribute the processing and huge datasets into several machines +- issues include - + - distributing the data + - parallelizing the workload + - scheduling execution on the different workers + - aggregating results + - recovering from failures +- solution - we model all problems using the map reduce model + - we pass the input data through map function, which outputs key value pairs + - then, the reducer receives all the values for a key, on which it can then perform some computation +- underneath, the map reduce framework takes care of all the issues we listed above - refer [this](/posts/hadoop/#theory) for the entire working of map reduce. e.g. [heartbeats mechanism](/posts/hadoop/#hadoop-2x) might be used to ensure the worker is running. 
if a worker's heartbeats stop coming in, its task is rescheduled on another worker
- if the master itself fails - 
  - the process can be restarted from scratch again
  - the master can take frequent snapshots, so when a new master is spun up, it can restore from where the faulty master left off
  - a backup master can run alongside the primary master, which stays in sync
- map reduce is great for cloud because - 
  - we easily get access to a lot of compute and storage
  - map reduce is batch processing - so we can run on demand and pay as we go, and not pay for extra compute

### Saga Pattern

- in [microservices](#microservices-architecture), we discussed how we should use [one database per service](#database-per-microservice)
- with one database per microservice, we lose out on [acid transactions](/posts/spring/#jpa)
- so, the saga pattern helps us manage consistency across microservices using distributed transactions
- if there is a failure in any of the microservices, a rollback is performed on the other microservices by applying an operation which has the "opposite effect" of the original operation
- saga pattern can be implemented using - 
  - [execution orchestrator pattern](#execution-orchestrator-pattern) - the execution orchestrator decides whether to proceed with the transaction or roll back the transaction on the previous service with a "compensating operation"
  - [choreography pattern](#choreography-pattern) - each service can either trigger the event for the next service if successful, or trigger the "compensating event" for the previous service if unsuccessful

![saga pattern](/assets/img/high-level-design/saga-pattern.png)

### Transactional Outbox Pattern

- helps implement reliability in an event driven architecture
- e.g. a service needs to update something in its database and send a message to a message broker
  - updating the database and sending a message to the message broker is not an atomic operation
  - so, if we perform the database operation first, it might happen that the database is updated but the message is never sent to the message broker
  - if we send the message first, the database might never be updated, but the message would have already been sent to downstream services
- extension of the above - we can argue that with [at least once semantics](#message-delivery-semantics), we can always ensure that the message gets sent. issue - 
  - we successfully update the database and commit the transaction
  - we fire the message
  - our server goes down at this point - otherwise the client libraries of kafka etc. are "intelligent" enough to resend the message if the ack from the broker is not received
  - the message gets dropped midway and never reaches the message broker
- to solve this, we use the "transactional outbox pattern" (a minimal sketch follows the diagram below)
- step 1 - as part of the same transaction, the service updates the actual data and inserts a new event into an "outbox table". either both the update and the insertion of this new event succeed, or both fail, since they are part of the same transaction
- step 2 - another service called the "message relay service" polls this outbox table and puts any new entries onto the message broker
- step 3 - it then either deletes the event or marks it as sent

![transactional outbox pattern](/assets/img/high-level-design/transactional-outbox-pattern.png)
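- a minimal sketch of steps 1 - 3, using sqlite as a stand-in for the service's database and a print as a stand-in for publishing to the broker. the table, column and function names are illustrative assumptions, not from any particular library - 
  ```python
  import json
  import sqlite3

  db = sqlite3.connect(":memory:")
  db.execute("create table orders (id integer primary key, status text)")
  db.execute("create table outbox (id integer primary key autoincrement, payload text, sent integer default 0)")

  # step 1 - the business write and the outbox insert happen in the SAME transaction
  def place_order(order_id):
      with db:  # commits both statements together, or rolls both back
          db.execute("insert into orders (id, status) values (?, ?)", (order_id, "PLACED"))
          db.execute("insert into outbox (payload) values (?)",
                     (json.dumps({"event": "order_placed", "order_id": order_id}),))

  # steps 2 and 3 - the message relay polls the outbox, publishes, then marks rows as sent
  def relay_once(publish):
      rows = db.execute("select id, payload from outbox where sent = 0 order by id").fetchall()
      for row_id, payload in rows:
          publish(payload)  # if we crash after publishing but before the update below,
          with db:          # the event is published again on the next poll - at least once
              db.execute("update outbox set sent = 1 where id = ?", (row_id,))

  place_order(42)
  relay_once(lambda message: print("published:", message))
  ```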
- issue 1 - "duplicate events" - just before step 3, the message relay service crashes. it then comes back up and performs steps 2 and 3 again, re-publishing events that were already sent. this situation is called [at least once delivery semantics](#message-delivery-semantics)
- solutions - 
  - the service logic is designed to be idempotent
  - assume that the outbox table adds a unique id to every event, which the message relay service also adds to the event it puts on the message broker. the consumer keeps track of the ids it has already consumed, and this way, when it sees a duplicate event, it knows it needs to discard it
- issue 2 - the database does not support transactions, e.g. non relational databases. step 1 of our solution relied on the fact that the insertion into the outbox table and the update to the regular tables can all be done under one transaction
- solution - instead, add an outbox parameter to the object itself, which contains the list of events to be sent
  ```
  {
    "name": "...",
    "outbox": [
      { ... }
    ]
  }
  ```
- now, the message relay service can poll all objects with this outbox parameter, and after adding the messages onto the queue, it can remove the outbox parameter from these objects
- issue 3 - ensure ordering of events. e.g. a user registers and then cancels, but we receive the cancel request first (which is dropped since no user is found), and then the registration is processed - which means the cancellation was ignored altogether. so, ordering of events might be important based on the use case
- for this, use a sequence id when storing events in the outbox table. this way, the message relay service always puts the messages onto the broker after sorting them by this sequence id

### Materialized View Pattern

- complex queries that involve different tables or maybe even different databases can be very slow - e.g. when we split our stack into microservices, the data is stored in different databases
- these complex queries also consume compute resources, thus increasing cost
- "materialized view" - a read only table is created with the result of the query
- consideration - additional storage cost
- two strategies to update - 
  - whenever the base tables get updated
  - based on a schedule
- two ways to update - 
  - some databases support materialized views out of the box. most such databases are efficient - they only take into account the modifications in the base tables, and do not recompute the entire materialized view from scratch
  - we can programmatically compute this materialized view ourselves and store it in an optimized, e.g. in memory, database
- refer [cqrs + materialized view pattern](#cqrs-pattern)

### CQRS Pattern

- cqrs - "command and query responsibility segregation"
- divide the service into two different services - 
  - "command service" - mutation of data - inserts, updates and deletes
  - "query service" - reads data and returns it to the caller
- these services have their own databases as well - the command database can be optimized for writes - e.g. using an sql database, while the query database can be optimized for reads - e.g. using a nosql database
- cqrs is useful when we have both frequent reads and frequent writes
- "synchronization" - to keep the command and query databases in sync, we can either use a message broker, or a function as a service
- using a message broker (in red) - 
  - an event is published via a message broker by the command service, which the query service can consume
  - now, the command service could have put the event onto the message broker directly. 
but, to prevent loss of messages, we can use the [transactional outbox pattern](#transactional-outbox-pattern) +- using a "function as a service" (in green) - + - a function as a service is sitting between the command and query database + - it will only be triggered when there is a change in the command database + - once triggered, it will go and update the query database + - since it is a function as a service, it only runs when there are updates, thus saving us costs + - doubt - is this essentially the architecture for cdc tools like debezium? + +![cqrs](/assets/img/high-level-design/cqrs.png) + +- cqrs drawbacks - + - we can only guarantee "eventual consistency" between command and query database + - we have additional complexity for two different services and for the logic for synchronization between them +- cqrs + materialized view - + - e.g. when we split our stack into microservices, the data is stored in different databases + - this means complex services will have to hit different databases (via api calls to their services), which can be slow + - so, we use one query service which receives events from "multiple command services" (multiple command services is the key here), and it stores the combined materialized view for all these services at one place + - e.g. one command service for courses, one command service for reviews + - and one query service for the [materialized view](#materialized-view-pattern) that joins the data from both services for an enriched course view + +### Event Sourcing Pattern + +- typically, data in databases is the current state - modifications override the previous state with new state +- sometimes, we need all the events that led to a state - e.g. we need to show all the transactions for a user's bank account +- so, we only store events instead of the current state +- events are "immutable" - we can only "append" events, not change existing ones +- event sourcing has high performance for write intensive workload - in normal databases in case of write heavy workloads, there is a high contention due to concurrent updates for the same tables and rows. with event sourcing, each write is "append-only", which involves lesser locks +- to find the current state, we only have to apply or replay all the events +- we can also store the events in message brokers instead of storing them in databases, but querying message brokers is more difficult than querying databases +- now, replaying all events for all queries every time might not be efficient. so, we can take "snapshots" at certain periods. we still have all the history, but for deriving the current state, we only need the records since the last snapshot +- another popular pattern - cqrs + event sourcing + - the command service just puts the writes to the write events on to the message broker. it can even get rid of its own database + - the query service listens to these events and accordingly populates its e.g. 
in memory database with the snapshot we discussed about for faster reads + - another pattern being used here is [event streaming](#event-driven-architecture) + - remember - cqrs means eventual consistency + +![event sourcing + cqrs](/assets/img/high-level-design/event-sourcing+cqrs.png) + +## Software Extensibility Patterns + +### Sidecar and Ambassador Pattern + +- apart from performing the core functionality based on [features of the system](#features-of-the-system), a service needs to do things like collect metrics, send its log events to a distributed logging service, connect to a service registry for the most up to date ip addresses of its downstream services, etc +- all these functionalities are also "common" across all our services - so we would not want to repeat ourselves +- one solution - we implement all this as a library, which all our services use +- disadvantage - different services might be implemented using different languages. so we would need to support the library for different languages, which is a lot of overhead +- so, we instead use "sidecar pattern" +- the sidecar is "isolated" from the main process - the additional function is run as a separate process / container on the same server +- the communication between the two is also very fast, since they run on the same host +- since the two use the "same resources" like file system, cpu, memory, etc - the sidecar can report the value for these resources easily +- the sidecar can now be implemented in any language of our choice +- after making the changes related to business logic in the main application, we do not need to test the sidecar + +![sidecar pattern](/assets/img/high-level-design/sidecar-pattern.png) + +- "ambassador pattern" is a particular type of sidecar pattern +- in ambassador pattern, the ambassador acts like a proxy +- the service just sends requests to the ambassador, which then forwards these requests to the actual server, by handling things like authentication, [retries](#retry-pattern), [circuit breaker](#circuit-breaker-pattern), etc +- using the ambassador pattern also allows us to perform "distributed tracing" easily + +### Anti Corruption Adapter / Layer Pattern + +- when we migrate from an old monolith to a new set of microservices +- the new set of microservices need to temporarily interact with the old monolith till the migration is complete +- this means that code for old apis and protocols is scattered in the new microservices +- so, we deploy an "anti corruption service" in between, which performs the translation between the new microservices to the old monolith (both request and response, as needed, to and from both microservices and monolith) +- sometimes, the anti corruption layer can be "temporary" or sometimes "permanent" when we cannot get rid of some parts of the legacy system - e.g. downstream services use the legacy application for reporting and are not ready for a migration yet + +![anti corruption adapter layer pattern](/assets/img/high-level-design/anti-corruption-adapter-layer-pattern.png) + +### Backends for Frontends Pattern + +- usually, we have a separate backend in front of our microservices, to serve the frontend +- now, the frontend just has to interact with this one backend, which performs the logic of relaying the request to the right microservice +- now, assume we have to support multiple frontends like desktops vs mobiles. they tend to interact with the api differently - + - e.g. 
mobile screens have lesser real estate so display lesser data than desktops + - mobile devices have lesser resources (ram etc) compared to desktop + - mobile app owners might want additional features like scanning barcode, only want products available in a particular location, etc +- now, our backend service starts to become a monolith - it has to support the additional features for the desktop, mobile app and the shared features between the two +- so, we use the bff or "backends for frontends pattern" +- we use a separate backend for each frontend +- each backend now stays slim, and allows its frontend to make use full use of its feature set +- we can now scale each backend individually as well - more server side computation might be needed for mobile apps then for desktops +- how to implement the shared functionality in these backends, e.g. login and register + - use a shared library - this pattern usually does not scale well, because - + - any change in this shared library affect all the backends that use it + - there is also often a "lack of ownership" with such shared libraries + - spin up another common service called a "shared backend service" +- the "user agent" header in requests helps us tell the device a request is coming from, and by placing an api gateway in front of these backends, we can decide which backend to route the request to based on the device type + +![backends for frontends pattern](/assets/img/high-level-design/backends-for-frontends-pattern.png) + +## Reliability, Error Handling and Recovery Patterns + +### Throttling and Rate Limiting Pattern + +- e.g. one client bombards our systems with multiple requests. this leads to high cpu and memory utilization of our resources. thus, our response time increases / services become unavailable, and we would be unable to serve other clients, thus violating our sla etc +- using "throttling and rate limiting", we set a limit on the number of requests in unit time / bandwidth (amount of bytes) in unit time +- "server side throttling" - we are the service providers and would like to limit our systems from over consumption +- server side throttling use case - we can have different grades of customer - premium and so on, and we would like different limits for these different customers +- "client side throttling" - we are calling external services and would like to set limits on the number of calls made to such services +- client side throttling use case - we can throttle requests for different services at different levels, based on their quotas +- we can handle this using different strategies - + - "drop requests" - status code is 429 (too many requests) + - "slow down the service" - queue the requests in a queue and process them later + - "degrade the service" - e.g. a video streaming platform can reduce the resolution of the video + +### Retry Pattern + +- we retry the same operation in case of a failure +- we should retry only if the failure is temporary and recoverable - e.g. 
not a user error like unauthorized, bad request, etc +- if the request succeeds on a retry, we were able to hide the internal issues from the user successfully +- so, we need to pick the right "delay" and the right "backoff strategy" for this delay + - "fixed delay" - the delay between subsequent requests stays same - 100, 100, 100, 100 + - "incremental delay" - the delay between subsequent requests increases linearly - 100, 200, 300, 400 + - "exponential backoff" - the delay between subsequent requests increases exponentially - 100, 200, 400, 800 +- we can add "jitter" - a random delay between these delays +- e.g. for incremental delay, instead of calculating delay using i * 100, we do i * (100 + random(-15, 15)) +- reason - clients might end up retrying at the same time, thus causing the retry storm. this jitter helps prevent the retry storm +- "retry storm" - some instances of the service were unhealthy, we bombarded the remaining instances with our retry requests and made the entire service unhealthy +- apart from the backoff strategy and delay, we can also configure how many times to retry / how long we should keep retrying for +- note - the operation we retry should be idempotent - the client will not know if the request was lost or the response - and if it was the response that was lost, we cannot retry a non idempotent operation +- retry pattern can be configured in the [ambassador pattern](#sidecar-and-ambassador-pattern) or implemented via popular libraries + +### Circuit Breaker Pattern + +- we were able to recover from temporary and recoverable issues using the [retry pattern](#retry-pattern) +- retry pattern is optimistic, while circuit breaker is pessimistic +- if the errors go above a certain threshold, the circuit breaker does not even allow the requests to go through +- this way, we save on resources and time from calling a service which might anyway be down +- after being in the open state for some time, the circuit breaker automatically goes into the "half open state" +- it allows a small percentage of requests to go through +- if they succeed, the circuit goes back into closed state + +![circuit breaker pattern](/assets/img/high-level-design/circuit-breaker-pattern.png) + +- we can either drop the requests, or save it in one place to be retried later. this approach is called "log and replay". it might be needed for requests that are not just simple get requests, but require calling a mutation endpoint on another service +- we should configure different circuit breakers for different services +- we can also replace the half open state with "asynchronous pings / health checks" to the service - once the health checks start passing, we can mark the circuit as closed. 
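- going back to the [retry pattern](#retry-pattern) above - a minimal sketch of a retry helper with exponential backoff and jitter. `TransientError`, the delays and the attempt counts are illustrative assumptions - 
  ```python
  import random
  import time

  class TransientError(Exception):
      """stands in for a recoverable failure (e.g. a timeout or a 503) - not a user error like a 400"""

  def retry(operation, max_attempts=5, base_delay=0.1):
      """call operation(), retrying transient failures with exponential backoff plus jitter"""
      for attempt in range(1, max_attempts + 1):
          try:
              return operation()
          except TransientError:
              if attempt == max_attempts:
                  raise  # retries exhausted - surface the failure to the caller / circuit breaker
              delay = base_delay * (2 ** (attempt - 1))  # exponential - 0.1, 0.2, 0.4, ...
              delay += random.uniform(-0.015, 0.015)     # jitter, so clients do not retry in lockstep
              time.sleep(max(delay, 0.0))

  # usage - the call fails twice with a transient error, then succeeds
  attempts = {"count": 0}
  def flaky_call():
      attempts["count"] += 1
      if attempts["count"] < 3:
          raise TransientError("temporarily unavailable")
      return "ok"

  print(retry(flaky_call))  # prints "ok" after two backoffs
  ```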
we get rid of the half open state in this technique +- this too can be configured in the [ambassador pattern](#sidecar-and-ambassador-pattern) or implemented via popular libraries + +### DLQ (Dead Letter Queue) Pattern + +- helps handle errors involving [message brokers](#message-brokers) + - producer error - the producer cannot put the message on the broker because the queue is already full, the message is too big, etc + - consumer error - the consumer cannot process the message due to some data discrepancy +- so, we introduce another special topic or queue called the "dead letter queue" +- two strategies - + - so, both the producer and consumer on encountering an error move the message to the dead letter queue themselves + - the message broker itself is configured to move messages to the dead letter queue + - producer errors can be identified easily by the message broker - queue is full, message is too big, etc + - for consumer errors, if the message would not be consumed for a long time, message brokers can conclude that the messages are not getting acknowledged, and it can move these messages to the dlq +- best practices - + - add the reason e.g. stacktrace to the message headers before moving the message to the dlq + - use aggressive monitoring and alerting for messages in the dlq + +## Deployment and Production Testing Patterns + +### Rolling Deployment Pattern + +- when deploying a newer version of our application to servers, we bring down the servers during a "maintenance window" +- sometimes, we might not be able to bring our servers down entirely, e.g. during an emergency release, which is not during the maintenance window +- steps - + - stop the load balancer from forwarding traffic to one server + - upgrade the application on this one server + - run some tests on this new version if needed + - allow the load balancer to send traffic to it again +- keep redoing this one after another till this is done for all the servers + +![rolling deployment pattern](/assets/img/high-level-design/rolling-deployment-pattern.png) + +- this way, our application is always up +- when releasing, if we notice any issues / errors, we can follow the same set of steps to perform a rollback +- advantage - + - no extra cost for hardware + - most widely used due to its simplicity +- drawbacks - + - it can result in "cascading failures" e.g. suppose the new servers start failing. now all the traffic will go to the old servers, which can inturn start failing as well due to "overload". now, this brings down our entire service + - if the new version is "incompatible" with the old version, there might be issues - e.g. 
db schema changes + +### Blue Green Deployment Pattern + +- "blue environment" - we keep the old version of our servers running as is throughout the release +- "green environment" - we deploy the new version of our servers to this environment +- we carry out tests on the green environment +- if the tests etc run fine, we shift the load balancer to point to the green environment +- if we see a failure at any point, we can shift the load balancer back to point to the blue environment +- finally, we can terminate the blue environment once we are done + +![blue green deployment pattern](/assets/img/high-level-design/blue-green-deployment-pattern.png) + +- advantages - both disadvantages of [rolling deployments](#rolling-deployment-pattern) - + - both environments have an equal number of servers, so the issue of cascading failures is prevented + - we can only run a single version of our software at a given moment, so the issue of incompatibility is prevented +- disadvantage - both advantages of [rolling deployment](#rolling-deployment-pattern) + - extra cost for hardware + - complicated to implement + +### Canary Testing and A/B Testing Deployment Pattern + +- "canary release" - borrows patterns from both [rolling deployment](#rolling-deployment-pattern) and [blue green deployment](#blue-green-deployment-pattern) +- we deploy the new version of the application to a small set of "existing" servers (instead of one by one to all existing servers like in rolling deployment) +- it is considered safer than rest of the deployment patterns because - + - for canary release, the performance etc is monitored for much longer than in other patterns + - only beta users get the traffic to the new servers - this can be done by the load balancer for e.g. by inspecting the origin header +- "ab testing / deployment" - ab testing works just like canary release +- however, in this case, we deploy with the motive of rolling back to the old version +- use case - we test the new feature and how it performs, but are not fully ready with them yet to go into full scale production +- sometimes, the users who are a part of this ab testing do not even know about it - they might be seeing new features and can be asked for feedback about it. this helps with genuine feedback + +![canary testing](/assets/img/high-level-design/canary-testing.png) + +### Chaos Engineering + +- "chaos engineering" deliberately injects random failures into our production systems +- it helps us find single points of failure, performance bottlenecks, etc +- advantage - + - system becomes more reliable with time + - development team becomes more proficient in monitoring and debugging production issues +- the types of failures we can inject - terminate random services, inject random latencies, etc +- e.g. of a tool - chaos monkey by netflix + +## Multi Tier Architecture + +- organize system into multiple "physical" and "logical" tiers +- "logical separation" - different tiers handle different concerns +- "physical separation" - allows each tier to be separately developed, scaled and upgraded +- multi tier != multi layer architecture +- multi layer is when the same application is broken into different modules +- however, it will still run as a single unit during runtime and will be a single tier architecture +- in a multi tier architecture, the different tiers run on different machines altogether +- restriction - communication cannot be skipped between tiers. 
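- a tiny sketch of this restriction, with each tier reduced to a single function - purely illustrative, since in a real system the tiers are separate deployments talking over the network - 
  ```python
  # data tier - storage and persistence only
  def data_tier_get_user(user_id):
      return {"id": user_id, "name": "alice"}  # stand-in for a database query

  # application tier - business logic; the only tier allowed to call the data tier
  def application_tier_get_profile(user_id):
      user = data_tier_get_user(user_id)
      return {"id": user["id"], "display_name": user["name"].title()}

  # presentation tier - talks only to the application tier, never to the data tier directly
  def presentation_tier_render(user_id):
      profile = application_tier_get_profile(user_id)
      return f"profile page for {profile['display_name']}"

  print(presentation_tier_render(1))  # profile page for Alice
  ```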
this helps keep the tiers loosely coupled + +![multi tier constraint](/assets/img/high-level-design/multi-tier-constraint.svg) + +- most common architecture - "three tier architecture" +- tier 1 - "presentation tier" - the ui on web browser, mobile app, desktop gui, etc +- it takes input from the users / shows them the relevant output +- it does not contain business logic +- tier 2 - "application tier", "logic tier", "business tier" +- it has all all the business logic based on the [features of the system](#features-of-the-system) +- tier 3 - "data tier" - responsible for storage and persistence +- it can contain files and or database +- this three tier architecture fits most use cases +- it allows for easy horizontal scaling +- tier 1 does not need any scaling since it runs on user devices +- tier 2 can run behind a load balancer and be scaled easily if it is stateless +- tier 3 can also be scaled well using techniques like partitioning and replication discussed [here](#improve-quality-attributes-of-databases) +- drawback of three tier architecture - tier 2 becomes a monolith +- monolith drawbacks + - high resource (cpu and memory) consumption + - harder to maintain codebase + - a fault can result in the entire system being down +- so, three tier architecture is good for companies who have a small codebase +- "two tier architecture" + - tier 1 - has both ui and business logic + - tier 2 - data tier +- "four tier architecture" - a new tier in the three tier architecture is introduced between tier 1 and tier 2 for [api gateway](#api-gateway), to address caching, security, etc + +## Microservices Architecture + +- recall [monolith drawbacks](#multi-tier-architecture) - high resource consumption, hard maintainability, lack of fault tolerance. microservices removes all these drawbacks - + - "independently deployable" + - each service can be easily scaled horizontally + - unlike monoliths which would relatively be much more resource intensive, microservices are much more efficient to scale and maintain + - we can make the right choice for tech stack for each microservice based on use case + - "loosely coupled" - helps with [organization scalability](#scalability) by breaking down codebase. now, a small team is responsible for this codebase + - helps with [fault tolerance](#fault-tolerance), since faults are now scoped to a smaller component +- disadvantage - + - overhead increases around testing, debugging issues, etc + - latency increases - more so if we do not ensure loose coupling when [decomposing the service](#migration-to-microservices) + - most important - distributed transaction management is much harder when compared to using a single database + +## Migration to Microservices + +- e.g. assume we have a [three tier architecture](#multi-tier-architecture) for our e commerce application, and would like to split the second tier into [microservices](#microservices-architecture) +- 3 principles to follow when creating microservices - + - "cohesive" - elements that are tightly coupled to each other should stay together inside the same microservice, so that each microservice can be developed and maintained independently + - srp or "single responsibility principle" - a microservice should only do one thing. 
this removes "ambiguity" around which microservice should own what piece of functionality + - "loosely coupled" - there should be minimum communication required between different microservices, and a microservice should be able to do its task independently +- size of a microservice does not matter, it is the 3 principles above that should influence decisions +- popular decomposition techniques - + - "business capabilities" - identify what provides value to business. take stakeholders pov + - "domain / subdomain" - also called "domain driven design" - instead of looking at it from a business side, we take the developers pov in this. types of domains / subdomains are - + - "core" - key differentiator / features of system + - "supporting" - integral in delivering the core capabilities, but not a differentiator - e.g. shipping + - "generic" - not specific to any business - can even be bought off the shelf - e.g. payments +- "incremental and continuous" approach should be used - + - identify the parts which will benefit the most from this migration + - parts requiring frequent changes - most important + - parts that have scalability issues +- "strangler fig pattern" - + - we keep a "strangler facade", which can be implemented using an api gateway, that sits between clients and our backend systems + - now, the api gateway initially routes requests to the monolith + - when the microservice is ready, the api gateway is switched to route requests to the microservice instead + - we can also use [canary testing / ab testing pattern](#canary-testing-and-ab-testing-deployment-pattern) here to direct a part of the traffic to the new decomposed microservices and slowly increase this percentage + - finally, the components of the microservice are removed from the monolith +- because of our incremental and continuous approach, the monolith keeps getting smaller and smaller +- original monolith should have a good "test coverage", to ensure this split does not break anything + +![strangler fig pattern](/assets/img/high-level-design/strangler-fig-pattern.png) + +## Microservices Best Patterns + +### Database Per Microservice + +- if we use the same database across different services, it results in "tight coupling" - recall that one of the principles of microservices was loose coupling +- e.g. if the schema changes due to one microservice, this change needs to be propagated to other microservices that use the same database as well +- if we use a database per microservice, each microservice owns its data and does not expose it to any other service +- the database of another microservice cannot be accessed directly - they have to go through the api of the owning microservice +- advantage - we can chose the database optimized for the workload of the microservice +- downsides - + - "added latency" - sending an additional request to the microservice and parsing the response is slower than accessing the data directly. to prevent the overhead of communication, we can cache the response of the responding microservice in the requestor microservice. however, this caching makes our system "eventually consistent" from "strictly consistent" + - cannot perform joins as easily now, since data is spilt across databases - solved by [cqrs](#cqrs-pattern) + - we lose out on acid transactions - performing a distributed transaction is very hard - solved by [saga](#saga-pattern) + - "data duplication" - data is now duplicated across microservices - e.g. 
product information might be duplicated in orders service + +### DRY Principle + +- dry - don't repeat yourself - we should not repeat ourselves +- this way, we only need to change the logic in one place +- by this logic, we might want to package the repeated logic of microservices into a shared library +- but, this is not a good practice - dry does not hold for microservices +- sharing a library introduces "tight coupling" - recall that one of the principles of microservices was loose coupling +- because for e.g. if a team makes changes to the shared library's apis, these changes need to be communicated to the other teams as well +- another drawback - "dependency hell" + - e.g. a microservice uses v1 of a library directly + - its shared library uses a different version (v2) of the same library + - now, the microservice needs to upgrade the library to v2 because of the shared library, retest the changes, etc +- solutions - + - we can increase the boundary of some microservice to include this shared logic, and other microservices call this microservice for the same + - we can spin up a new microservices containing that shared logic + - we can use the [sidecar or ambassador pattern](#sidecar-and-ambassador-pattern) as well for e.g. for observability +- note - shared libraries is a good pattern for sharing data models - request and response dto +- for this, we have techniques like code generation tools that can generate implementations for all languages based on an "interface definition" + +### Structured Autonomy + +- myth - teams can chose their own tech stack, databases, tools, etc +- doing things differently in different microservices around building, testing, maintaining codebase, etc introduces a lot of overhead +- autonomous is allowed but under certain boundaries, hence the term "structured autonomy" +- tier 1 - "fully restrictive" - should be uniform across the whole firm - e.g. monitoring and alerting, ci / cd, etc +- tier 2 - "autonomy within boundaries" - database technologies +- tier 3 - "complete autonomy" - e.g. release process + +### Microfrontends + +- we can split the monolithic frontend just like we [split microservices](#microservices-architecture) - based on domain / subdomain or based on business capabilities +- each microfrontend is an spa +- all these microfrontends are assembled inside a "runtime container" +- the runtime container can also handle things like authentication / authorization +- now, each microfrontend has its own ci cd and can be released independently +- best practices - + - microfrontends should be loaded at runtime, and not as compile time dependencies, otherwise the release schedule etc would still be tied to each other + - sharing state should not be done - it is equivalent to [sharing a database in microservice](#database-per-microservice). we should instead use custom events, pass callbacks, use address bar, etc + +## Event Driven Architecture + +- three actors are involved - producer, consumer and event +- use event driven architecture when we can classify actions as "fire and forget" / "asynchronous" +- events are immutable +- events can be stored indefinitely in our system (unlike requests in synchronous communication) +- unlike in the "request response model", where the sender needs to be aware of receiver's api, data models, url, etc. in event driven architecture, the publisher does not care and is not even aware of its consumers. 
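- a minimal in-memory sketch of this publish / subscribe decoupling - the producer only knows the topic name and never which consumers exist. the `Broker` class here is a toy stand-in for a real broker like kafka or rabbitmq - 
  ```python
  from collections import defaultdict

  class Broker:
      """toy message broker - maps a topic name to a list of subscriber callbacks"""
      def __init__(self):
          self.subscribers = defaultdict(list)

      def subscribe(self, topic, handler):
          self.subscribers[topic].append(handler)

      def publish(self, topic, event):
          # the publisher never sees who (if anyone) consumes the event
          for handler in self.subscribers[topic]:
              handler(event)

  broker = Broker()
  # two independent consumers subscribe to the same topic
  broker.subscribe("order_placed", lambda e: print("billing service charges order", e["order_id"]))
  broker.subscribe("order_placed", lambda e: print("shipping service packs order", e["order_id"]))
  # the order service just publishes - it is unaware of billing and shipping
  broker.publish("order_placed", {"order_id": 42})
  ```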
this helps achieve "decoupling", [one of the principles in designing microservices](#migration-to-microservices) +- refer [message brokers](#message-brokers) for more points, advantages and examples of this approach +- 2 event delivery patterns are supported - event streaming and publish / subscribe +- "event streaming" + - the message broker acts like a permanent storage + - the consumer can view any number of past events based on use case + - optionally, the message broker can remove the events from storage after a period of time +- "publish / subscribe" + - the message broker acts like a temporary storage + - only new events from the point the consumer joins are visible +- allows for implementing patterns like [event sourcing](#event-sourcing-pattern), [cqrs](#cqrs-pattern), [saga](#saga-pattern) + +## Message Delivery Semantics + +- failures can happen during multiple stages - (draw quickly in interview using arrows) + - the producer sending the message to the broker fails + - the producer sending the message succeeds but the acknowledgement from the broker fails + - the message broker sending the message to the receiver fails + - the receiver receiving the message succeeds but the processing fails + - the receiver processing succeeds but the acknowledgement to the broker fails +- this is what the "message delivery semantics" help addressing +- "at most once delivery" - + - the producer does not wait for acknowledgement from broker - so, if the message is lost from producer to broker, we loose the event + - the consumer sends the acknowledgement immediately to the broker before starting its processing - so, if the consumer crashes after receiving the event, we loose the event + - use case of at most once delivery - when we are fine with data loss + - we can extrapolate lost events - e.g. location updates in a ride sharing service + - advantage - at most once delivery has the least latency and cost +- "at least once delivery semantics" - + - the producer will resend the event if the acknowledgement is not received - so, it can result in duplicate events if the message is received but the acknowledgement is lost + - consumer sends the acknowledgement to the broker only after successfully processing the event - so, if the consumer crashes after processing the event and before the acknowledgement, it can result in duplicate events + - use of at least once delivery - data loss is not acceptable + - e.g. reviews can be overridden if received multiple times + - disadvantage - more latency, e.g. broker and producer need to wait for acknowledgements etc +- "exactly once delivery" - + - very difficult to achieve + - we generate a unique id / the message broker does this for us automatically + - then, the message broker checks if it has already received this id in the past by checking its log + - the consumer needs to check in its database if the event with this id has already been processed, and accordingly handle the event + - understand that the consumer can still receive the message multiple times like in "at least once delivery". however, our consumer code logic is smart and if it sees a duplicate, it simply ignores it and sends an acknowledgement, thus avoiding processing the event multiple times + - so, my understanding - exactly once from producer to message broker might be guaranteed by message broker, but message broker to consumer needs to be guaranteed by us? + - e.g. 
processing of payments need to happen exactly once + - note - kafka guarantees exactly once when transferring data between kafka topics +- so, my final understanding - + - for ensuring that the message reaches the broker from the producer, use [transactional outbox pattern](#transactional-outbox-pattern) + - for ensuring that the message reaches the consumer from the broker, use at least once delivery semantics + - to ensure exactly once, maintain the processed ids of events in the consumer database + +## Testing + +- unit test - + - test a class / method / module in isolation + - advantage - cheap to maintain, fast to execute + - we should have a lot of unit tests + - disadvantage - give the least confidence about overall system +- integration test - + - verify the different systems we integrate with, e.g. databases, message brokers, etc + - disadvantage - run slower + - we should have fewer integration tests + - give more confidence about our system +- functional / end to end test - + - run on the entire system + - works from an end user perspective - so each test should test the entire user journey + - very slow to run + - we should have very few end to end tests + +![testing pyramid](/assets/img/high-level-design/testing-pyramid.png) + +- in microservices, for integration tests, we can use "lightweight mocks" for our upstream services +- disadvantage - mocks will not help us identify changes to the api of the actual upstream services +- so, we can use "contract tests" alongside the integration tests +- the idea is that the downstream service saves the results of its integration tests (the requests and responses it expects) in a contract file +- these tests are then run on the actual upstream service - the requests are replayed, and the responses are asserted using the expected response of the downstream service +- contract testing can be used for asynchronous communication as well - the downstream service tells the message it expects, and the upstream service asserts that the message is triggered when the appropriate functionality is called +- so, contract tests are basically a great addition / alternative to integration tests by themselves in microservices +- e.g. spring cloud contract +- if our company cannot afford functional / end to end tests, we can directly test in production + - using [blue green deployment](#blue-green-deployment-pattern), we can test in the blue environment before we switch traffic of the load balancer from blue environment to green environment + - [canary testing](#canary-testing-and-ab-testing-deployment-pattern) + +## Network Protocols + +- "application layer protocol" - two methods are there - + - "client server protocol" - e.g. http, ftp, smtp, websockets + - everything in client server protocol (including websockets) uses tcp + - http - follows a request response model. the client sends a request, while the server returns a response + - websockets - + - client and server have a bidirectional full duplex communication + - note - websockets are not the same as peer to peer - clients can talk to server, but clients cannot talk with each other + - it is an alternative to inefficient continuous polling using the request response model + - "peer to peer protocol" - e.g. 
web rtc (realtime communication) + - all can talk with each other - even clients can talk to each other + - this makes it fast, since messages need not be "relayed" via the server + - web rtc uses udp, which also makes it fast +- "transport / network layer" - + - tcp - + - transport control protocol + - a single (virtual) connection is maintained + - on this connection, all packets are sent one by one + - maintains an ordering of packets + - receiver sends acknowledgements for every packet + - udp - + - user datagram protocol + - no connection as such is maintained + - packets can be sent in parallel + - no concept of ordering + - this makes it less reliable than tcp + - but, this also makes it faster than tcp + - use case - live streaming - if we miss some bits of a live video call, we will not rewind back to listen what we missed + +## Caching + +- store frequently accessed data in fast memory rather than accessing it every time from slow memory +- it helps reduce latency +- important - it also helps achieve "fault tolerance" +- there are different places where data can be cached - client side (on browser), cdn, api gateway / load balancer and application caching at server side (the focus of this section) +- this application cache (e.g. redis) sits between our server and database +- distributed caching - imagine we had only one cache server - it would become a single point of failure. so, we use consistent hashing technique to help scale the cache server easily - now, based on the key that our application server uses, the request would be directed to the right underlying cache server automatically +- the 5 strategies have been discussed below +- vvimp - all strategies below can be explained nicely if using sequence diagrams + +### Cache Aside Strategy + +- if cache hit, return +- if cache miss - + - application to cache - miss + - application to db to fetch data + - application to cache to populate cache + - application returns response to client +- the application continues to work even if the cache goes down - it falls back to database +- our current strategy does not interact with the cache for db writes, only reads. this results in following problems - + - for new data, there will always be a cache miss first + - inconsistency - we do not invalidate cache for updates, so updates to our data will not be reflected in the cache +- note - here, our application server has the logic for interaction with cache - so, we can also modify the data being stored in cache based on our needs to optimize it + +### Read Through Strategy + +- same as [cache aside strategy](#cache-aside-strategy) +- however, now the cache interacts with the database, and we do not get a chance to modify the data being stored in the cache i.e. 
the data inside cache would be the same as the data inside database +- how cache miss works - application will never know if it was actually a cache miss or a hit - + - application to cache - miss + - cache to db to fetch data + - cache populates itself + - cache returns response to application + - application returns response to client + +### Write Around Strategy + +- when writing data to the database - invalidate the cache for the key +- the application removes the key from cache / marks the dirty flag as true for this document +- it is used alongside [read through strategy](#read-through-strategy) or [cache aside strategy](#cache-aside-strategy) +- it basically solves the inconsistency problem we had there + +### Write Through Strategy + +- first write into the cache, and then write the same thing into the database +- "2 phase commit" - we need to ensure that both the operations are performed inside a single transaction - either both pass or both fail +- this too is used alongside [read through strategy](#read-through-strategy) or [cache aside strategy](#cache-aside-strategy) +- advantage over [write around strategy](#write-around-strategy) - fetches for new data would not result in a cache miss - write around only solved the inconsistency problem, not this problem +- drawback - our system is now less fault tolerant - if either db or cache goes down, our application will go down + +### Write Back (or Behind) Strategy + +- unlike [write through strategy](#write-through-strategy) where we synchronously write into the database for a successful write, we asynchronously put the write into a queue after updating the cache in this case +- the data gets written into the database from the queue into the database eventually +- advantage - if our system is write heavy, it helps buffer writes into the database +- it also adds a lot of fault tolerance to our system - we no longer depend on the availability of database +- one failure scenario and its solution - + - a write operation is performed - write is performed on the cache and is put onto the queue + - the db is down for 5 hrs / the db is already performing writes which will take another 5 hrs to complete - so the message just sits in the queue + - the cache ttl is 3 hrs - so after 3 hrs, it tries to fetch the data from the database again + - now, since the write has not been processed by the database yet, it will not return this record to the cache, and our system will think that the data does not exist in the first place + - solution - make the ttl of cache higher + +## Transaction + +- database transactions should be "acid" compliant + - "atomicity" - either all operations are completed successfully or they are all rolled back + - "consistency" - database should go from one consistent state to another. e.g. - + - some operation should not lead to a state where for e.g. the payer's balance has reduced but receiver's balance has not increased + - operations should not violate "integrity constraints". 
recall - key, domain, entity, referential + - "isolated" - concurrent transactions do not interfere with each other - one transaction will not see the "intermediate state" of another transaction + - "durable" - results are persisted to disk so that they can withstand system failure +- "commit" - make operations of the transaction complete +- "rollback" - revert all changes caused due to a transaction +- "save point" - allows us to rollback parts of transactions +- when executing a transaction, "database locks" are used to lock either tables or rows depending on the database implementation +- so, transactions should be small, since they consume a lot of locks +- when something is locked, other operations will wait on this lock for it to be released +- these concepts work when transactions are local to a particular database. for distributed systems, we can use - + - [2 phase commit](#2-phase-commit) - popular + - [3 phase commit](#3-phase-commit) - not much used due to complexity + - [saga pattern](#saga-pattern) - popular +- both 2 phase and 3 phase commit are said to be "synchronous", while saga pattern is "asynchronous" - because the locks are not held in saga pattern +- typically, saga pattern is used for long transactions, when it is not feasible to keep the locks for such long periods and block all other operations + +## 2 Phase Commit + +- there are two phases - + - voting / prepare phase + - decision / commit / abort phase +- we have a "transaction coordinator" +- all the microservices participating in this flow are called "participants" +- note - before performing any operation - both actors - transaction coordinator and participants write the operation to their local file - i think this is like "write ahead log". before performing any request / any acknowledgements, this file is updated first +- this way, if any of them go down, they can read from this file when they come back up +- voting / prepare phase - all the services are notified about the update to perform. at this point, the services obtain relevant locks for this update and respond with an ok +- if for any reason this operation cannot be performed - e.g. "order service" is the transaction coordinator, but the participant "inventory service" responds that there is not enough stock - then the service responds with a not ok +- based on the response from participants from earlier phase, the transaction coordinator asks all participants to commit / abort the transaction +- disadvantage of 2 phase commit - coordinator service is the single point of failure +- if for some reason the transaction coordinator goes down after the participants have obtained locks, the other transactions performed on the participants would be stalled because of this lock. the lock would be held till the transaction coordinator comes back up and responds + +![two phase commit](/assets/img/high-level-design/two-phase-commit.png) + +## 3 Phase Commit + +- same as [2 phase commit](#2-phase-commit), except that the commit phase is broken into two parts - + - pre commit phase + - commit phase +- during the pre commit phase - the transaction coordinator only sends the decision of commit or abort - this operation is not performed +- the actual commit / abort is only performed the next phase - the commit phase +- now in this pattern, unlike in 2 phase, there is intercommunication between the participants +- this way, if there is a failure at either the pre commit or the commit phase, the participants can make decisions - e.g. 
if any of the participants had received the commit message from the coordinator, it means that the rest of the participants can also kick off the commit phase + +## Database Indexing + +- data pages - + - internally, data is not stored as tables - that is just a representation + - it creates data pages - generally 8kb i.e. 8192 bytes in size + - it has three parts - + - header - 96 bytes - metadata like page number, free space, checksum, etc + - data records - 8192-(96+36) = 8060 bytes - holds the actual data records + - offset - 36 bytes - using an array, each index stores a pointer to the corresponding data record in the data records section described above + - e.g. if a row is 64 bytes, one data page can store 8060/64 = 125 table rows + - so for storing one table, the underlying dbms will manage multiple data pages +- data blocks - + - data pages ultimately get written to data blocks + - a data block is a section in the actual underlying physical memory that can be read from / written to in one i/o operation + - the dbms does not have control over the actual data block, only data page + - a data block can hold one or more data pages + - so, the dbms maintains a mapping of what data page is stored inside what data block +- indexing - it is used to increase the performance of database queries. without indexing, the database would have to - + - load all the data blocks one by one + - go through all the data pages in this block one by one + - go through all the data records this page one by one +- b+ trees - + - databases instead use b+ tree to help achieve logN time, instead of the n described above for crud operations + - b trees vs b+ trees - in b+ trees, the nodes at the leaf node level also maintain links to each other, unlike in b trees + - m order tree or m ary tree means means a node can have m - 1 keys and m pointers + - this tree maintains the sorted property + - the tree is always height balanced + - the actual values are always in leaf nodes + - the values in all other intermediary nodes just help with traversing the tree / reaching the leaf nodes quickly + - right is greater than or equal to, left is strictly lesser than + - notice how the leaf node level is like a sorted array + - the key is the value of the node, which helps us with efficient traversal + - alongside this, an additional pointer is stored in every (mainly leaf) node as well, which points to the actual data page + - now, using the data page to data block mapping, we can fetch the right data block + +![b+ tree](/assets/img/high-level-design/b+-tree.png) + +- types of indexing present in rdbms - clustered indexing and non clustered indexing +- clustered indexing - + - what order the original b+ tree is constructed in is determined by the column we use for clustered indexing + - this is why only one clustered index is allowed - because it affects how the original b+ tree is constructed + - the records in the "data records" section of data page may be jumbled - they are ordered according to insertion time + - however, we want them to be ordered based on our indexed column + - so, we use the offset field - recall that offset is an array + - assume our id insertion order is 1 4 5 2 + - the offset would look like this - (pointer to 1, pointer to 2, pointer to 4, pointer to 5) = (0, 3, 1, 2) + - if we do not specify anything - the primary key is used for clustered index +- non clustered indexing - + - we have many other keys - secondary index, composite index, etc - they all use non clustered indexing under the hood + - 
we can have multiple non clustered indices, unlike the clustered index
  - each non clustered index will use a new b+ tree
  - so, while clustered indexing determines how the original b+ tree is constructed, non clustered indexing determines how this additional b+ tree is constructed
  - the leaf nodes of this new b+ tree contain pointers to the actual data pages

## Concurrency Control

- "critical section" - accessing a shared resource
- e.g. multiple users try to book the same seat, which is seen as free by all of them - and they all try to confirm the same seat
- techniques like using `synchronized` work only for contention among multiple threads of the same process
- so, we need to use "distributed concurrency control" for different processes on potentially different machines
- we have two types of distributed concurrency control - "optimistic concurrency control" and "pessimistic concurrency control"
- "shared locks" - 
  - shared locks are used for reads
  - assume one transaction puts a shared lock on some row
  - another transaction can also come in and put a shared lock on this row
  - however, another transaction cannot come in and put an exclusive lock on this row - it would have to wait till all the shared locks are removed from this row
- "exclusive locks" - 
  - exclusive locks are used for writes
  - assume one transaction puts an exclusive lock on some row
  - another transaction cannot come in - neither with a shared nor an exclusive lock
- "dirty read problem" - 
  - both transaction a and transaction b start
  - transaction a updates the value of a row to 5
  - transaction b reads this value as 5
  - however, due to some error, transaction a has to roll back its changes to the original value
  - so, transaction b read intermediate, uncommitted data of transaction a
- "non repeatable read" - 
  - transaction a reads the value of balance as 100
  - transaction b comes in, updates the value to 110 and commits
  - when transaction a tries reading the value of the row again, it reads it as 110
  - so, transaction a read different values for the same row during different parts of its transaction
- "phantom read" - 
  - transaction a sees 500 rows in the database
  - transaction b comes in and commits 5 new rows
  - transaction a now sees 505 rows in the database
  - so, transaction a saw a different number of rows in the database during different points in the transaction
- isolation levels - recall the isolation property of [acid](#transaction)

| isolation level | dirty read possible | non repeatable read possible | phantom read possible |
|------------------|---------------------|------------------------------|------------------------|
| read uncommitted | yes | yes | yes |
| read committed | no | yes | yes |
| repeatable read | no | no | yes |
| serializable | no | no | no |

- "read uncommitted" - 
  - no locks are used
  - only use it when the system involves only reads
- "read committed" - 
  - a shared lock is acquired for a read but released as soon as the read is over
  - this explains why we can see values committed by other transactions when we try reading twice
  - an exclusive lock is acquired for a write and kept till the end of the transaction
- "repeatable read" - 
  - a shared lock is acquired for a read and kept till the end of the transaction
  - an exclusive lock is acquired for a write and kept till the end of the transaction
- "serializable" - 
  - works just like repeatable read
  - additionally, it puts a "range lock" on the rows that it touches
- typically, we can set the transaction isolation level like so - 
  ```sql
  set transaction isolation level repeatable read;
  begin_transaction;
  ...
  commit transaction;
  end_transaction;
  ```
- "optimistic concurrency control" - 
  - uses the read committed isolation level
  - solves the concurrency problem using "versions"
  - in case of the non repeatable read, transaction a would know that the version has changed (refer to the example of non repeatable read above)
  - advantage - allows much higher levels of concurrency as compared to pessimistic concurrency control
  - disadvantage - if we have too many concurrent writes, we would fail at the last step for all of them, thus wasting too many resources
- "pessimistic concurrency control" - 
  - uses the repeatable read / serializable isolation level
  - can run into many more deadlock scenarios - 
    - transaction a and transaction b start off in parallel
    - transaction a acquires a shared lock on row a
    - transaction b acquires a shared lock on row b
    - transaction a tries to acquire an exclusive lock on row b - it cannot because of transaction b
    - transaction b tries to acquire an exclusive lock on row a - it cannot because of transaction a
  - understand how the exact same scenario would not have resulted in a deadlock in case of optimistic concurrency control, because it does not hold the shared lock
  - database systems are able to detect deadlocks like this and then fail the transactions

## 2 Phase Locking

- 2 phase locking is a type of [pessimistic concurrency control](#concurrency-control)
- there are 3 types of 2 phase locking - "basic", "conservative" and "strict"
- "basic 2 phase locking" - 
  - phase 1 - "growing phase" - the transaction can only acquire new locks. 
the lock manager can either grant or reject this request + - phase 2 - "shrinking phase" - transaction cannot acquire any new locks, only release locks +- basic 2 phase locking has two issues - deadlocks and cascading aborts +- "deadlock" example - the exact one we discussed in [pessimistic concurrency control](#concurrency-control) is a good example +- "cascading aborts" example - + - recall how releasing of locks is done one by one in the shrinking phase of basic 2 phase locking + - so, lets say transaction a releases exclusive lock on a row as part of the shrinking phase + - now, lets say transaction b acquires a shared lock on this row as a part of its growing phase + - now, what if transaction a had to be aborted suddenly due to some error + - now, transaction b has an inconsistent value of the row, and it would have to be aborted as well +- deadlocks can be solved by conservative 2 phase locking and wait for graph +- cascading aborts can be solved by strict 2 phase locking +- cascading aborts are considered very expensive, since they can result in a "chain of cascades" +- note - we want to maintain some degree of concurrency as well, not just consistency, like discussed during optimistic and pessimistic concurrency control +- so, we typically use strict 2 phase locking to resolve cascading aborts and wait for graph for resolving deadlocks + +![2 phase locking](/assets/img/high-level-design/2-phase-locking.png) + +- "wait for graph" - + - the scheduler maintains a graph, where the nodes represent the transactions + - e.g. if transaction a is waiting on a lock to be released which has been acquired by transaction b, then there is an edge from transaction a to transaction b + - once there is a cycle that is detected by the scheduler in the graph, it looks for the "victim" in this graph, and then aborts that transaction + - in choosing the victim, it might make considerations like the amount of effort already put in by this transaction, amount of effort to rollback this transaction, how many cycles would be removed by aborting this transaction, etc +- "conservative 2 phase locking" - + - requires transactions to acquire all locks at the beginning itself + - either the scheduler assigns all the locks to the transaction if possible + - or the transaction will have to wait if one or more of the locks are unavailable + - disadvantages - allows very less concurrency, does not prevent cascading aborts +- "strict 2 phase locking" - + - all the locks are released at once when the transaction is aborted / committed + - disadvantages - allows very less concurrency, does not prevent deadlocks + +## OAuth + +- oauth2 - authentication and authorization standard +- e.g. there is a third party app called tweet analyzer that uses tweet data to show analytics to a front user +- option 1 - we give tweet analyzer our credentials. this option is insecure, since - we share our credentials with a third party app, thus we compromise our credentials. tweet analyzer can now do everything that we as an account owner can do, e.g. create tweets, i.e. there is no restricted access +- option 2 - twitter gives temporary access to tweet analyzer app +- oauth2 is a specification / protocol, which we need to implement +- it has various "grant types" - "authorization code" and "client credentials" are the two most important grant type flows for now +- "resource owner" - the end user i.e. us +- end users own "resources", e.g. *tweets* in our case +- "client" - *tweet analyzer* in our case. 
it is the third party application trying to get restricted access to the "resources"
organization a (client) interacts with organization b (resource server and authorization server) +- "client credentials" grant type flow - + - client sends request to authorization server with - + - client id + - client secret + - grant type - "client_credentials" + - scope + - client gets back access token + - client uses this access token to request resource server +- "refresh token" helps avoiding resource owners from initiating entire login flow again after access token expires + - client sends request to resource server with expired access token, hence gets a 401 + - client sends request to authorization server with - + - client id + - client secret + - grant type - "refresh_token" + - refresh token + - client receives back a fresh access and refresh token + - the client can use this new access token now to make requests to the resource server +- refresh tokens expiry - + - refresh tokens do not typically have an expiration, but can have one + - also, refresh tokens can be "rolling" i.e. they are single use and should be replaced with the new refresh token received every time a request for a fresh access token is made +- how can a resource server verify the access token provided by the client? - three options - + - api interaction between authorization server and resource server. drawback - an additional api call from resource server to authorization server every time + - both authorization server and resource server can have access to the same shared storage. drawback - shared storage + - recommended - when the resource server boots up, it gets a public certificate from the authorization server. this public certificate is used to validate if the access token has been tampered. also called "jwk endpoint" +- oidc - openid connect - oauth helped with authorization. by adding openid on top of it, we can use it for authentication as well +- a specific scope called "openid" is added to the list of scopes to get the identity details of the resource owner +- this way, we additionally get an id token along with access and refresh tokens +- the id token is in the form of jwt +- unlike access token, id token contains things like user name, email, etc - this is what helps with authentication +- so, two things are being done by our resource server - + - it is verifying the access token using the certificate + - it is parsing the token to get user roles, and this is possible because the token is in jwt format - recall how payload and header are just base64 encoded +- we send the token using `Authorization: Bearer <>` +- authorization code grant type flow by itself would only work when we use jsp, thymeleaf, etc i.e. server side templating languages +- however, we cannot hide the client secret in spa applications, since the entire source code is accessible from the browser +- so, we use pkce - proof key for code exchange +- so, the client generates + - "code verifier" - a random cryptic string + - "code challenge" - base64(sha256(code verifier)) +- the ui first when asking for the authorization code in the "authorization code" grant type flow sends the code challenge +- bts, the authorization server stores this code challenge, and returns the authorization code +- the ui then sends a request for an access token. 
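- a minimal sketch of the token request this flow boils down to, using `java.net.http` - the endpoint, client id, client secret and scope below are placeholders, and the exact parameters are listed in the next bullet -
  ```java
  import java.net.URI;
  import java.net.http.HttpClient;
  import java.net.http.HttpRequest;
  import java.net.http.HttpResponse;

  public class ClientCredentialsExample {

      public static void main(String[] args) throws Exception {
          // form-encoded body with client id, client secret, grant type and scope (placeholder values)
          String body = "grant_type=client_credentials"
                  + "&client_id=my-client-id"
                  + "&client_secret=my-client-secret"
                  + "&scope=read";

          HttpRequest request = HttpRequest.newBuilder()
                  .uri(URI.create("https://auth.example.com/oauth/token")) // placeholder token endpoint
                  .header("Content-Type", "application/x-www-form-urlencoded")
                  .POST(HttpRequest.BodyPublishers.ofString(body))
                  .build();

          HttpResponse<String> response = HttpClient.newHttpClient()
                  .send(request, HttpResponse.BodyHandlers.ofString());

          // the json response carries the access token (and its expiry)
          System.out.println(response.body());
      }
  }
  ```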
unlike in the regular "authorization code" grant type flow, where this request would include the client secret, here it includes the code verifier instead
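- a minimal sketch of how a client could generate the code verifier and code challenge described above (the spec constrains the allowed lengths and characters more tightly than this) -
  ```java
  import java.nio.charset.StandardCharsets;
  import java.security.MessageDigest;
  import java.security.SecureRandom;
  import java.util.Base64;

  public class PkceExample {

      public static void main(String[] args) throws Exception {
          // code verifier - a high-entropy random string kept secret by the client
          byte[] random = new byte[32];
          new SecureRandom().nextBytes(random);
          String codeVerifier = Base64.getUrlEncoder().withoutPadding().encodeToString(random);

          // code challenge - base64url(sha256(code verifier)), sent with the first request
          byte[] hash = MessageDigest.getInstance("SHA-256")
                  .digest(codeVerifier.getBytes(StandardCharsets.US_ASCII));
          String codeChallenge = Base64.getUrlEncoder().withoutPadding().encodeToString(hash);

          System.out.println("verifier  = " + codeVerifier);
          System.out.println("challenge = " + codeChallenge);
      }
  }
  ```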
remember - we do not even need different cores for this +- **parallelism** means performing different tasks on different cores. it **increases performance** +- **throughput** is the number of tasks completed per unit time +- **latency** is the time taken per unit task +- how are the two different + - for optimizing throughput, since the tasks themselves are different, they just need to be scheduled on different threads in parallel, and that automatically increases the throughput. therefore, fewer considerations exist + - for optimizing latency, we would probably break a single task into smaller subtasks. considerations - + - what parts of the original task can be performed in parallel and which parts have to be done sequentially + - how to aggregate the smaller chunks of results into the final result +- in case of multithreading, components like **heaps get shared across the threads**, while components like **stack and instruction pointer are scoped to a single thread** +- what is stored inside the stack - + - local primitive types, e.g. if we declare an `int a = 1` inside a method + - **primitive formal parameters** - similar to above - `int add(int a, int b)` + - references created inside functions are stored in stack +- what is stored inside the heap - + - all the objects (not references) are stored in heap + - members of a class - **primitive values** and **non primitive references** - should be stored in heap - note how these when declared inside functions are stored in stacks as discussed above +- ideally this makes sense - remember, each thread is executing a different instruction, so each of them needs its own instruction pointer etc +- a frame is created for every method call - this way, when a method gets over, it is popped off from the stack - last in first out - main method gets popped of last from the stack +- a lot of frames in the stack can result with a stack overflow exception if we end up with too many frames +- heap belongs to a process, and all threads can write to / read from the heap at any given time +- all objects are stored in the heap till there is a reference to them, after which they get garbage collected by the garbage collector +- note - we can write `System.gc()`, which hints the jvm to run this garbage collector +- strength of java is this automatic memory management, which we do not have to worry about +- when we execute a program, it becomes a process i.e. it gets loaded into the memory from the disk and a thread is used to execute it +- there are often way more processes being executed than cores in a cpu. so, using **context switching**, one thread at a time gets cpu and, gets paused and another thread is scheduled on the cpu +- context switching has overhead, and doing a **lot of it can lead to** something called **thrashing** +- however, context switching between the threads of the same process is much cheaper than context switching between the threads of different processes, since a lot of components like heaps are reused +- when the operating system has to chose between scheduling multiple tasks on a thread, and if for e.g. 
it schedules a computationally expensive task first, it can lead to the **starvation** of other smaller tasks +- so, to combat issues like this, there are various algorithms used by the operating system to calculate the priority of a task +- we can also programmatically provide a priority **which gets used in the calculation above** +- a thing that struck me - when writing applications, do not base your conclusions off the computer you are running your code on, base it off how it would work on the server +- number of threads = number of cores is the best way to start, since context switching as discussed earlier consumes resources +- however, it is only optimal if the threads are always performing some computation, and never in blocked state. if the threads perform some io, then a thread performing some computation can take its place +- also, modern day computers use **hyper threading** i.e. the same physical core is divided into multiple virtual cores. this means that a core can run more than one thread in modern cpus + +## Thread Creation + +- we create an instance of `Thread` and to it, we pass an object of a class that implements `Runnable`. its `run` method needs to be overridden. all of this can be replaced by a lambda java 8 onwards + ```java + Thread thread = new Thread(() -> System.out.println("i am inside " + Thread.currentThread().getName())); + thread.start(); + ``` +- if instead of using `Runnable`, we extend the `Thread` class, we get access to a lot of internal methods +- when we run `Thread.sleep`, we instruct the os to not schedule that thread until the timeout is over +- note misconception - invoking this method does not consume any cpu i.e. it is not like a while loop that waits for 5 seconds +- we can set a name of a thread to make it helpful when debugging, using `thread.setName()` +- we can set a priority between 1 and 10 using `thread.setPriority` +- we can use `thread.setUncaughtExceptionHandler` to catch "unchecked exceptions" that might have occurred during the execution of the thread, and thus cleanup resources +- we can shut down the application entirely from any thread using `System.exit(0)` + +## Thread Coordination + +- the application will not terminate until all threads stop +- but, we might want to interrupt a thread so that the thread can maybe understand that the application wants to terminate, and accordingly handle cleaning up of resources + ```java + Thread thread = new Thread(new Task()); + thread.start(); + thread.interrupt(); + ``` +- the interruption can be handled gracefully in two ways as described below + - if our code throws an interrupted exception, calling `interrupt` will trigger it, and then we can handle it. 
other calls that can throw this exception include `thread.join()` and `object.wait()`
blocking + ```java + int numberOfProcessors = Runtime.getRuntime().availableProcessors(); + ExecutorService executorService = Executors.newFixedThreadPool(numberOfProcessors); + + executorService.execute(new Runnable() {...}); + ``` +- **cached thread pool executor** - it looks at its threads to see if any of them are free, and if it is able to find one, it will schedule this task on the free thread. else, it will spawn a new thread. too many threads is not too big of a problem, thanks to the keep alive timeout discussed later. however, expect **out of memory exceptions** if too many tasks are added to the executor, because threads are resource intensive + ```java + ExecutorService executorService = Executors.newCachedThreadPool(); + ``` +- to remember - threads occupy a lot of space in main memory, hence can cause out of memory exceptions if not controlled properly +- **scheduled thread pool executor** - it used a delay queue, so that the tasks get picked up by the threads after the specified delay or schedule. this means tasks might have to be reordered, which is done by the queue itself. `schedule` can help trigger the task after a certain delay, `scheduleAtFixedRate` can help trigger it like a cron at regular intervals while `scheduleAtFixedDelay` can help schedule the next task a fixed time period after the previous task was completed + ```java + ScheduledExecutorService executorService = Executors.newScheduledThreadPool(5); + executorService.schedule( + () -> System.out.println("hi from " + Thread.currentThread().getName()), + 5, + TimeUnit.SECONDS + ); + ``` +- **single thread pool executor** - like fixed thread pool executor with size of pool as one. the advantage is for e.g. all the tasks will be run in order of creation +- all thread pool executors create new threads if the previous thread is killed for some reason +- there are a variety of parameters that can be added to the executors +- **core pool size** - minimum number of threads that are always kept in the pool +- **max pool size** - maximum number of threads that can be present in the thread pool. it has value `INTEGER.MAX_VALUE` by default for cached and scheduled thread pool executor, while the same value as core pool size for fixed and single thread pool executor +- **keep alive timeout** - the time till an idle thread is kept in the pool, after which it is removed. keep alive is only applicable to cached and scheduled thread pool executors, since in fixed and single thread pool executors, the number of threads do not change +- note that keep alive timeout does not change the core pool threads. this behavior can however be changed using `allowCoreThreadTimeOut` +- **queue** - the different types of executors use different queues based on their requirements. the queues also need to be thread safe + - e.g. a fixed and single thread pool executor has a fixed number of threads, so there can potentially be infinite number of tasks that get queued up, because of which it uses a `LinkedBlockingQueue` + - cached thread pool spawns number of threads equal to the number of tasks, so it uses a `SynchronousQueue`, which only needs to hold one task + - scheduled thread pool uses `DelayedWorkQueue` so that the tasks are returned from the queue only if the condition of cron etc. is met +- **rejection handler** - assume all threads are occupied and the queue is full. in this case, the thread pool will reject the task that it gets. how it rejects the task is determined using the rejection policy. 
the different rejection policies are - + - **abort** - submitting the new task throws `RejectedExecutionException`, which is a runtime exception + - **discard** - silently discard the incoming task + - **discard oldest** - discard the oldest task from the queue to add this new task to the queue + - **caller runs** - requests the caller thread itself to run this task +- till now, to obtain an instance of `ExecutorService`, we were using static methods on `Executors`. we can also use `new ThreadPoolExecutor()` and then pass our own core pool size, queue, etc. configuration parameters as the constructor arguments +- we need to shut down the executor in a clean way. we can initiate it using `executorService.shutdown()`. this will throw the `RejectedExecutionException` for any new tasks that are submitted to it, but at the same time will complete both all the currently executing tasks and queued up tasks +- if we run `shutdownNow`, it will return `List` for the queued up tasks and clear the queue, but complete all the currently executing tasks +- `awaitTermination(timeout)` will terminate the tasks if they are not completed by the specified time +- we also have helper methods like `isShutdown()` and `isTerminated()` +- if a task wants to return a value, we use `Callable` instead of `Runnable` +- however, the `execute` method on `ExecutorService` only works if we implement `Runnable` interface. if we implement `Callable` interface, we have to use `submit` +- the return value of `Callable` is wrapped around a `Future`. `future.get()` is a blocking call i.e. the thread calling it will not move ahead until the future resolves. so, we can also use `future.get(timeout)` + ```java + ExecutorService executorService = Executors.newFixedThreadPool(1); + + Future result = executorService.submit(() -> { + Thread.sleep(4000); + return (new Random()).nextInt(); + }); + + Thread.sleep(3000); + // this simulates that we were able to perform 3 seconds worth of operations + // in the main thread while the task thread was performing its blocking stuff + + System.out.println("result = " + result.get()); + ``` +- we can cancel the task using `future.cancel(false)`. this means that the thread pool will remove the task from the queue. the false means that if a thread is already running the task, it will not do anything. had we passed true, it would have tried to interrupt the task +- we also have helper methods like `future.isDone()` and `future.isCancelled()` +- suppose we have a list of items, and for each item, we want to perform a series of processing + ```java + Future package$ = executorService.submit(() -> pack(order)); + Future delivery$ = executorService.submit(() -> deliver(package$.get())); + Future email$ = executorService.submit(() -> sendEmail(delivery$.get())); + ``` + notice how the calling thread is blocked by all `get` of future. instead, we could use - + ```java + CompletableFuture.supplyAsync(() -> pack(order)) + .thenApply((package) -> deliver(package)) + .thenApply((delivery) -> sendEmail(delivery)) + // ... + ``` +- in the above case, we have specified a series of steps to run one after another and since we do not care about the results in our main thread, the assigning of tasks to threads is managed by java itself. the main thread is not paused by the get calls. 
notice how we also do not need to specify any executor +- if we use `thenApplyAsync` instead of `thenApply`, a different thread can be used to execute the next operation instead of the previous one +- internally, `CompletableFuture` uses fork join pool, but we can specify a custom executor as well, e.g. `thenApplyAsync(fn, executor)` + +## Race Condition + +- **race condition** - happens where **resource is shared** across multiple threads + ```java + public class SharedResourceProblem { + + public static void main(String[] args) throws Exception { + + Integer count = 10000000; + Counter counter = new Counter(); + + Thread a = new Thread(() -> { + for (int i = 0; i < count; i++) { + counter.increment(); + } + }); + + Thread b = new Thread(() -> { + for (int i = 0; i < count; i++) { + counter.decrement(); + } + }); + + a.start(); b.start(); + a.join(); b.join(); + + System.out.println("shared resource value = " + counter.getCount()); + // shared resource value = 15 + } + } + + class Counter { + + private int count = 0; + + public void increment() { + count += 1; + } + + public void decrement() { + count -= 1; + } + + public int getCount() { + return count; + } + } + ``` +- the `resource += 1` and `resource -= 1` operations are not atomic, it comprises of three individual operations - + - getting the original value + - incrementing it by one + - setting the new value +- solutions - identify critical sections and use locks, make operations atomic, etc + +## Synchronized + +- we can wrap our code blocks with a **critical section**, which makes them atomic. this way, only one thread can access that block of code at a time, and any other thread trying to access it during this will be suspended till the critical section is freed +- say we use `synchronized` on multiple methods of a class +- once a thread invokes one of the synchronized method of this class, no other thread can invoke any other synchronized method of this class. this is because **using synchronized on a method is applied on the instance (object) of the method** +- the object referred to above is called a **monitor**. only one thread can acquire a monitor at a time +- method one - prefix method signature with synchronized (refer the counter example earlier. the shared resource print would now print 0) + ```java + public synchronized void increment() { + // ... + } + ``` +- another method is to use synchronized blocks + ```java + synchronized (object) { + // ... + } + ``` +- using blocks, the code is much more flexible since we can have different critical sections locked on different monitors +- if using synchronized on methods, two different methods of the same class cannot be executed in parallel - the monitor there is the instance itself +- however, when using synchronized blocks, we can do as follows inside different methods of the same class - + ```java + Object lock1 = new Object(); + Object lock2 = new Object(); + + // ... + + synchronized(lock1) { + // ... + } + + synchronized(lock2) { + // ... + } + ``` +- note - reduce critical section size for better performance + +## Atomic Operations + +- so, **assignment to references and primitive values in java are atomic** + - `this.name = name` inside for e.g. a constructor is atomic + - `int a = 8` is atomic +- however, an **exception** in this is assignment to longs and doubles. since it is 64 bit, it happens in 2 operations - one assignment for the lower 32 bit and another one for the upper 32 bit +- the solution is to declare them with **volatile**, e.g. 
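- a small runnable sketch of such a chain - the strings below are just stand-ins for the pack / deliver / email steps -
  ```java
  import java.util.concurrent.CompletableFuture;

  public class CompletableFutureExample {

      public static void main(String[] args) {
          CompletableFuture<String> pipeline = CompletableFuture
                  .supplyAsync(() -> "packed order")                    // runs on a common pool thread
                  .thenApply(pkg -> pkg + " -> delivered")              // chained steps, no executor specified
                  .thenApply(delivery -> delivery + " -> email sent");

          // the main thread is free to do other work here while the chain runs

          System.out.println(pipeline.join());                          // block only at the very end
      }
  }
  ```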
`volatile double a = 1.2` +- using volatile makes operations on longs and doubles atomic +- also, java has a lot of atomic classes under `java.util.concurrent.atomic` as well +- remember - when we use volatile, we make assignment atomic, not operations like `a++` atomic +- my doubt probably cleared - then what is the use for e.g. `AtomicReference`, if assignment to reference is already an atomic operation? we can do as follows (a metric example discussed later) + ```java + AtomicReference state$ = new AtomicReference<>(); + state$.set(initialValue); + + State currentState = state$.get(); + State newSate = computeNewState(); + Boolean isUpdateSuccess = state$.compareAndSet(currentState, newState); + ``` + +## Data Race + +- remember - race condition and data race are two different problems +- **data race** - when the order of operations on variables do not match the sequential code we write. this happens mostly because there are optimizations like prefetching, vectorization, rearranging of instructions, etc + ```java + class Pair { + + private int a = 0; + private int b = 0; + + public void increment() { + a++; + b++; + } + + public void check() { + if (b > a) { + System.out.println("well that doesn't seem right..."); + } + } + } + ``` + calling the class - + ```java + Pair pair = new Pair(); + + Thread t1 = new Thread(() -> { while (true) pair.increment(); }); + Thread t2 = new Thread(() -> { while (true) pair.check(); }); + + t1.start(); t2.start(); + t1.join(); t2.join(); + ``` +- our expectation is that since b is read before a and a is incremented before b, there is no way even with a race condition that b can be bigger than a. however, due to data race, we do hit the print statement +- data race is also where we can use `volatile`. **volatile guarantees the order of instructions being executed** + ```java + private volatile int a = 0; + private volatile int b = 0; + ``` +- this is called the **visibility problem** +- basically, the two threads have their own **local cache**, but also have a **shared cache**. they write the value to the local cache, but this does not + - either update the shared cache + - or the second thread's local cache does not refresh its value from the shared cache +- **however, when we use volatile, it refreshes / synchronizes both the shared cache and the local cache of all threads** +- basically, code before access to a volatile variable gets executed before it, and code after the access to a volatile variable after. this is called the happens before relationship +- while we could have just used synchronized for both the methods above, realize the advantage of using volatile over synchronized. with synchronization, we lose out on the multithreading, since our functions would have been invoked one at a time. in this case, the two methods are still being invoked concurrently +- if we have n cores, for each core we have a register. then we have an associated l1 cache on top of each register. l2 cache can be shared across multiple cores, and finally we have only one l3 cache and ram
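- a small sketch contrasting the two - as the next few bullets note, volatile only makes the write atomic, not a read-modify-write like `count++`, which is where the atomic classes come in -
  ```java
  import java.util.concurrent.atomic.AtomicLong;

  public class VolatileVsAtomic {

      // assignment to this volatile long is atomic, but counterA++ is still three operations
      private static volatile long counterA = 0;

      // incrementAndGet is a single atomic operation
      private static final AtomicLong counterB = new AtomicLong(0);

      public static void main(String[] args) throws InterruptedException {
          Runnable task = () -> {
              for (int i = 0; i < 100_000; i++) {
                  counterA++;                     // lost updates are possible here
                  counterB.incrementAndGet();     // always ends up at the expected value
              }
          };

          Thread t1 = new Thread(task);
          Thread t2 = new Thread(task);
          t1.start(); t2.start();
          t1.join(); t2.join();

          System.out.println("volatile counter = " + counterA);        // often less than 200000
          System.out.println("atomic counter   = " + counterB.get());  // always 200000
      }
  }
  ```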
+ ![multithreading](/assets/img/java/multithreading.drawio.png) +- **java memory model** - it is an enforcement that jvm implementations have to follow so that java programs have similar behavior everywhere, and the different optimizations of instructions, cache, etc. do not affect the functioning of the program + +## Locking Strategies and Deadlocks + +- **coarse-grained locking** - meaning we use one lock for everything, just like having synchronized on all methods, not performant. its counterpart is **fine-grained locking** +- coarse grained locking example - make all methods of the class synchronized +- cons with fine-grained locking - we can run into deadlocks more often +- conditions for a deadlock - + - **mutual exclusion** - only one thread can hold the resource at a time + - **hold and wait** - the thread acquires the resource and is waiting for another resource to be freed up + - **non-preemptive** - the resource is released only when the thread is done using it and another thread cannot acquire it forcefully + - **circular wait** - a cyclic dependency is formed where threads wait for resources acquired by each other +- one way to prevent deadlocks is to acquire locks in our code in the same order. this need not be considered when releasing the locks +- another way can be to use techniques like `tryLock`, `lockInterruptibly`, etc (discussed later) +- reentrant lock - instead of having a synchronized block, we use this reentrant lock + ```java + Lock lock = new ReentrantLock(); + ``` +- unlike synchronized where the block signals the start and end of the critical section, locking and unlocking happens explicitly in case of reentrant locks +- to avoid deadlocks caused by for e.g. the method throwing exceptions, we should use it in the following way - + ```java + lock.lock(); + try { + // critical section + } finally { + lock.unlock(); + } + ``` +- it provides a lot of methods for more advanced use cases like `getOwner`, `getQueuedThreads`, `isHeldByCurrentThread`, `isLocked`, etc +- the name `Reentrant` comes from the fact that the lock can be acquired by the thread multiple times, which means it would have to free it multiple times as well, e.g. think about recursive calls. we can get the number of times it was acquired using `getHoldCount` +- another benefit of using reentrant locks is **fairness** - e.g. what if a thread repeatedly acquires the lock, leading to the starving of other threads? we can prevent this by instantiating it using `new ReentrantLock(true)` +- note that introducing fairness also has some overhead associated with it, thus impacting performance +- if we do not set to true, what we get is a **barge in lock** i.e. suppose there are three threads waiting for the lock in a queue. when the thread originally with the lock releases it, if a new thread not in the queue comes up to acquire the lock, it gets the lock and the threads in the queue continue to stay there. however, if we had set the fairness to true, the thread with the longest waiting time gets it first +- so, two problems - "fairness" and "barge in lock" are solved by reentrant lock +- if the lock is not available, the thread of course goes into the suspended state till it is able to acquire the lock +- we can use `lockInterruptibly` - this way, another thread can for e.g. call `this_thread.interrupt()`, and an interrupted exception is thrown. this "unblocks" the thread to help it proceed further. 
had we just used lock, the wait would have been indefinite + ```java + try { + lock.lockInterruptibly(); + } catch (InterruptedException e) { + // cleanup and exit + } + ``` +- similar to above, we also have the `tryLock` method, which returns a boolean that indicates whether a lock was successfully acquired. it also accepts timeout as a parameter, what that does is self-explanatory +- this can help, for e.g. in realtime applications to provide feedback continuously without pausing the application entirely + ```java + while (true) { + if (lock.tryLock()) { + try { + // critical section + } finally { + lock.unlock(); + } + } else { + // some logic + } + // some logic + } + ``` +- so, we saw how reentrant lock, which while works like synchronized keyword, has additional capabilities like telling current owner and locking using different strategies like `lockInterruptibly` and `tryLock` +- when locking till now, we used mutual exclusion to its fullest. but, we can be a bit more flexible when the shared resource is just being read from and not written to +- multiple readers can access a resource concurrently but multiple writers or one writer with multiple readers cannot +- this is why we have `ReentrantReadWriteLock` + ```java + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + Lock readLock = lock.readLock(); + Lock writeLock = lock.writeLock(); + ``` +- fairness in `ReentrantReadWriteLock` works the same way as `ReentrantLock`, except that if the thread waiting for the longest time was a reader, all reader threads in the queue are freed up to read +- of course, base decisions off of type of workloads - if workload is read intensive, read write lock is better, otherwise we might be better off using the normal reentrant lock itself + +## Inter Thread Communication + +- **semaphore** - it helps restrict number of users to a resource +- remember - locks only allow one user per resource, but semaphores allow multiple users to acquire a resource +- so, we can call a lock a semaphore with one resource + ```java + Semaphore semaphore = new Semaphore(number_of_permits); + ``` +- when we call `semaphore.acquire()` to acquire a **permit**, and the number of permits reduces by one. if no permits are available at the moment, the thread is blocked till a resource in the semaphore is released +- similarly, we have `semaphore.release()` +- optionally, i think both `acquire` and `release` accept n, the number as an argument which can help acquire / release more than one permit +- another major difference from locks - there is **no notion of owning thread** in semaphores unlike in locks - e.g. a semaphore acquired by thread a can be released by thread b. so, thread a can acquire it again without having ever released it +- this reason also makes semaphores are a great choice for producer consumer problems. producer consumer problem using semaphores - + - we need a lock so that multiple threads cannot touch the queue at one go + - we start with the full semaphore being empty and the empty semaphore being full, since there are no items initially + - look how we use semaphore's philosophy to our advantage - consumer threads acquire full semaphore while producer threads release it + - my understanding of why we need two semaphores - e.g. if we only had full semaphore - producer releases it and consumer acquires it - how would we have "stopped" the producer from producing when the rate of production > rate of consumption? 
it is almost as if the two semaphores help with **back pressure** as well
+ notify(); + } + ``` +- when we call `wait` on an object, the thread it was called on continues to be in waiting state until another thread calls `notify` on that object +- `notify` will wake up any random thread that was sleeping, and to wake up all threads we can use `notifyAll` +- note, important, my understanding - the order of operations should not matter i.e. calling `notify` vs changing of state - because everything is inside a critical section, inside the same monitor +- if we think about it, the `lock.lock()` and `lock.unlock()` are the starting and ending of `synchronize` blocks respectively, `condition.await()` is like `wait()` and `condition.signal()` like `notify()` +- introducing locks can make our code more error-prone, more subject to deadlocks etc. however, it makes the code more flexible, e.g. unlike synchronized blocks which have to exist within a single method, locks can be acquired and freed from different methods +- using locks result in issues like deadlocks if coded improperly +- our main objective is to execute instructions as a single hardware operation +- we can achieve this by using Atomic classes provided by java + ```java + AtomicInteger count = new AtomicInteger(initialValue); + count.incrementAndGet(); + ``` +- recall how we had discussed that `a = a + 1` actually consisted of three atomic operations, which has all been condensed down into one using these java helper classes +- so, recall the counter example in shared resource earlier, and how we had solved it using synchronized. we can now get rid of the `synchronized` and implement it as follows - + ```java + public void increment() { + count.incrementAndGet(); + } + ``` +- the disadvantage of using these classes is of course that only each operation by itself is atomic, a series of such calls together is not atomic, so it may be good only for simpler use cases +- a lot of operations use `compareAndSet` underneath, and we have access to it to. it sets the value to the new value if the current value matches the expected value. otherwise, the old value is retained. it also returns a boolean which is true if the current value matches the expected value + ```java + count.compareAndSet(expectedValue, newValue); + ``` +- `AtomicReference` can be used for any object type to get and set values in a thread safe i.e. atomic way, and we can use methods like compareAndSet on it +- e.g. notice how below, the synchronized keyword is not used for addSample, but we still have a thread safe implementation by using `compareAndSet`. note how and why we use a loop - if the old value stays the same before and after calculating the new value, then update using the new value, else recalculate using the new value using the "new old value" + ```java + class Metric { + + int count = 0; + + int sum = 0; + } + + class MetricAtomic { + + AtomicReference metric$ = new AtomicReference<>(new Metric()); + + public void addSample(int sample) { + Metric currentMetric; + Metric newMetric; + do { + currentMetric = metric$.get(); + newMetric = new Metric(); + newMetric.count = currentMetric.count + 1; + newMetric.sum = currentMetric.sum + sample; + } while (!metric$.compareAndSet(currentMetric, newMetric)); + } + } + ``` +- we often have a lot of tasks but not so many threads. some objects are not thread safe i.e. cannot be used by multiple threads. however, they can be used by multiple tasks being executed on the same thread. 
coding this ourselves can be tough, which is why we have `ThreadLocal`, which basically returns a new instance for every thread, and reuses that instance when a thread asks for that instance again + ```java + public static ThreadLocal car = ThreadLocal.withInitial(() -> new Car()); + ``` +- spring uses the concept of this via `ContextHolder`s in for instance, `RequestContextHolder`, `TransactionContextHolder`, `SecurityContextHolder`, etc. my understanding - since spring follows one thread per-request model, this way, any of the services, classes, etc. that need access to information can get it easily. it is like setting and sharing state for a request + +## High Performance IO + +- what is **blocking io** - when cpu is idle, e.g. when reading from database etc +- such **io bound tasks** block the thread till they return the result +- io bound tasks are very common in web applications etc +- how it works internally -
+ ![io bound](/assets/img/java/io-bound-architecture.drawio.png) + - the controllers like network cards return the response to the dma (direct memory access) + - the dma writes it to the memory + - the dma notifies the cpu that the response is available + - the cpu can now access the memory for variables +- so, during this entire duration, the thread that was processing the request that involved the io task (and thus reaching out to the controller) was sitting idle and thus **was blocked** +- this is why number of threads = number of cores does not give us the best performance when we have more io bound instead of cpu intensive tasks +- this is why we have a "thread per request model" in spring mvc, which i believe caps at 200 threads to prevent out of memory errors etc +- it has caveats like - + - creating and managing threads are expensive - recall how it has its own stack etc + - number of context switching increases, which too is an expensive operation - recall **thrashing** + - assume that there are two kinds of calls a web server supports - one that makes a call to an external service and one that calls the database. assume the external service has a performance bug, which makes the first call very slow. this way, if we had for e.g. 150 requests for first call and 150 for the second call (assume 200 is the default thread pool size in embedded tomcat), the 150 instances of the second call would start to be affected because of the 150 instances of the first call now +- so, the newer model used by for e.g. spring web flux is **asynchronous** and **non blocking** +- the thread is no longer blocked waiting for the response - a callback is provided which is called once the request is resolved +- so now, we can go back to the **thread per core** model - which is much more optimal +- there can be problems like **callback hell** etc, which is solved by using libraries like project reactor for reactive style of programming, which is more declarative to write + +## Virtual Threads + +- till now, the `Thread` class we saw was actually a wrapper around an actual os thread +- these are also called **platform threads** - since they map one to one with os threads +- **virtual threads** - they are not directly related to os threads. 
they are managed by the jvm itself +- this makes them much less resource intensive +- the jvm manages a pool of platform threads, and schedules the virtual threads on these platform threads one by one +- once a virtual thread is **mounted** on a platform thread, it is called a **carrier thread** +- if a virtual thread cannot progress, it is **unmounted** from the platform thread and the platform thread starts tracking a new virtual thread +- this way, the number of platform threads stay small in number and are influenced by the number of cores +- there is no context switching overhead just like in reactive programming - what we are saving on here - frequent normal (hence platform hence os threads) context switching is replaced by frequent virtual thread context switching +- creation techniques - + ```java + Runnable runnable = () -> System.out.println("from thread: " + Thread.currentThread()); + + new Thread(runnable).start(); // platform thread (implicit) + // from thread: Thread[#19,Thread-0,5,main] + + Thread.ofPlatform().unstarted(runnable).start(); // platform thread (explicit) + // from thread: Thread[#20,Thread-1,5,main] + + Thread.ofVirtual().unstarted(runnable).start(); // platform thread + // from thread: VirtualThread[#21]/runnable@ForkJoinPool-1-worker-1 + ``` +- note - virtual threads are only useful when we have blocking io calls, not when we have cpu intensive operations +- this happens because unlike the usual model where our thread had to sit idle for the blocking call, the platform thread never stops here and is always working, it is the virtual thread that is sitting idle, and hence we optimize our cpu usage because we are using our platform threads optimally +- so, developers still write the usual blocking code, which simplifies coding, as compared to say reactive programming +- underneath, the blocking calls have been refactored for us to make use of virtual threads so that the platform threads are not sitting idle +- e.g. cached thread pools replacement is **new virtual thread per task executor** - we do not have to create pools of fixed size - we use a thread per task model and all the complexity is now managed by jvm for us bts +- when we are using normal threads for blocking calls e.g. using jpa, the thread cannot be used. what we can do is use context switching to utilize the cpu better. however, this model meant we needed a lot of platform threads, and managing them, context switching between them, etc has a lot of overhead, which is why maybe embedded tomcat for instance had a cap of about 200 threads. 
now with virtual threads, there is no cap needed, so it can be used via cached thread pool executor equivalent, but here there would never be any out of memory etc issues like in cached thread pool executor, since virtual threads are very lightweight +- some notes - + - virtual threads are always daemon, and making them non daemon will throw an exception + - virtual threads do not have a concept of priority + +## Miscellaneous Notes + +- io bound threads are prioritized more than computation based threads + - since most of the time of ios threads is spent in waiting state + - and most of the time of cpu bound threads is spent in computation + - maybe this is related to concepts of starvation etc somehow +- why context switching is expensive - the entire state of the thread has to be saved in memory - all the stack, instruction pointer, local variables inside the method, etc +- thread yield - helps hint to the scheduler that the current thread wants to give up its processor + - can be used by computationally expensive threads to hint the scheduler that they want to give up the processor for another thread +- the priority we set manually only serves as a hint - the os can choose to accept / ignore it +- `thread.start()` is not the same as `thread.run()` - `thread.run()` simply runs the runnable we pass to it inside the calling thread + ```java + public static void main(String [] args) { + + Thread thread = new Thread(() -> + System.out.println("Hello from " + Thread.currentThread().getName())); + thread.setName("New Thread"); + thread.run(); // Hello from main + } + ``` +- Thread.State - an enum, with the following states - + - NEW - created but not yet started + - RUNNABLE - thread is available for execution / already executing on some processor + - BLOCKED - blocked for a monitor / lock + - WAITING - a thread goes into this state after we call `object.wait()` or `some_thread.join()` - so, the idea is that the thread now waits for some other thread's action? + - a thread can also go "out" of this state after we call `object.notify()` from elsewhere to wake this thread up + - TIMED_WAITING - same as above but with timeouts? threads on calling `Thread.sleep` also go to this state + - TERMINATED - after thread has finished execution +- when we override the `start` method, we need to call `super.start()` +- when we say `Thread.sleep(x)`, first the thread goes into timed_waiting state. 
after the timeout, when the thread does go back into the runnable state, there is no guarantee that it will immediately be scheduled on a core - a core might be occupied by some other thread
+- an `IllegalMonitorStateException` is thrown if we try to call `await` / `signal` on a Condition without locking the `lock` first
+
+## Example - Rate Limiting Using Token Bucket Filter
+
+- a bucket gets filled at the rate of 1 token per second
+- the bucket has a capacity of n
+- there can be multiple consumers - when they ask for a token, they should get one - they will be stalled till a token is available
+- producer code - 
+  ```java
+  @SneakyThrows
+  private void produce() {
+
+      while (true) {
+
+          synchronized (this) {
+
+              if (tokens < capacity) {
+                  tokens += 1;
+              }
+
+              notifyAll();
+          }
+
+          Thread.sleep(1000);
+      }
+  }
+
+  void startProducing() {
+
+      Thread producerThread = new Thread(this::produce);
+
+      producerThread.setDaemon(true);
+      producerThread.start();
+  }
+  ```
+- consumer code - 
+  ```java
+  @SneakyThrows
+  void consume() {
+
+      synchronized (this) {
+
+          while (tokens == 0) {
+              wait();
+          }
+
+          tokens -= 1;
+      }
+  }
+  ```
+- final bucket code, putting it together - 
+  ```java
+  static class Bucket {
+
+      int tokens;
+
+      int capacity;
+
+      public Bucket(int capacity) {
+          this.capacity = capacity;
+          tokens = 0;
+      }
+
+      private void produce() { ... }
+      void startProducing() { ... }
+      void consume() { ... }
+  }
+  ```
+
+### A Good Test
+
+- notice how, since we start consuming only after 7 seconds, the first 5 consumer threads get their token instantly (the bucket is already full at its capacity of 5)
+- while the remaining three threads take 1 second each
+- output - 
+  ```
+  1716809831> thread 0 consumed successfully
+  1716809831> thread 1 consumed successfully
+  1716809831> thread 2 consumed successfully
+  1716809831> thread 3 consumed successfully
+  1716809831> thread 5 consumed successfully
+  1716809832> thread 4 consumed successfully
+  1716809833> thread 7 consumed successfully
+  1716809834> thread 6 consumed successfully
+  ```
+
+```java
+Bucket bucket = new Bucket(5);
+bucket.startProducing();
+Thread.sleep(7000); // let the producer run for 7 seconds - the bucket (capacity 5) is full by now
+
+List<Thread> threads = new ArrayList<>();
+
+for (int i = 0; i < 8; i++) {
+
+    Thread t = new Thread(() -> {
+        bucket.consume();
+        System.out.printf("%s> %s consumed successfully\n",
+                System.currentTimeMillis() / 1000, Thread.currentThread().getName());
+    });
+    t.setName("thread " + i);
+
+    threads.add(t);
+}
+
+threads.forEach(Thread::start);
+
+for (Thread t : threads) {
+    t.join();
+}
+```
+
+## Example - Implementing a Semaphore
+
+- java does have a semaphore, but we initialize it with the initial permits
+- there is no limit as such to the maximum permits in java's semaphore
+- here, implement a semaphore which is initialized with the maximum allowed permits, and which starts out with all of those permits available
+- acquire - 
+  ```java
+  @SneakyThrows
+  synchronized void acquire() {
+
+      while (availablePermits == 0) {
+          wait();
+      }
+
+      Thread.sleep(1000);
+
+      availablePermits -= 1;
+      notify();
+  }
+  ```
+- release - 
+  ```java
+  @SneakyThrows
+  synchronized void release() {
+
+      while (availablePermits == maxPermits) {
+          wait();
+      }
+
+      Thread.sleep(1);
+
+      availablePermits += 1;
+      notify();
+  }
+  ```
+- actual semaphore - 
+  ```java
+  static class Semaphore {
+
+      private final int maxPermits;
+
+      private int availablePermits;
+
+      public Semaphore(int maxPermits) {
+          this.maxPermits = maxPermits;
+          this.availablePermits = maxPermits;
+      }
+
+      synchronized void acquire() { ... }
+
+      synchronized void release() { ... 
} + } + ``` +- tips for testing - + - initialize using 1 + - make the thread calling acquire slow + - make the thread calling release fast + - show release would not be called until acquire is called + +## Example - Implementing a Read Write Lock + +- acquiring read - + ```java + @SneakyThrows + synchronized void acquireRead() { + + while (isWriteAcquired) { + wait(); + } + + readers += 1; + } + ``` +- acquiring write - + ```java + @SneakyThrows + synchronized void acquireWrite() { + + while (isWriteAcquired || readers != 0) { + wait(); + } + + isWriteAcquired = true; + } + ``` +- releasing read - my thought - just call `notify` to wake up just 1 writer + ```java + synchronized void releaseRead() { + readers -= 1; + notify(); + } + ``` +- releasing write - my thought - call `notifyAll` to wake up all writers + ```java + synchronized void releaseWrite() { + isWriteAcquired = false; + notifyAll(); + } + ``` + +## Example - Dining Philosophers + +- five philosophers - either eat or think +- they share five forks between them +- they need two forks to eat - so at a time, only two philosophers can eat + +![dining philosophers](/assets/img/java-multithreading/dining-philosophers.png) + +- remember - we can easily end up in a deadlock - assume all philosophers acquire the fork on their left, and now all of them will wait for the fork on their right. two solutions are + - only four philosophers at a time try acquiring a fork. this way, at least one philosopher will always be able to acquire two forks and it solves the problem + - all the philosophers but one try acquiring the left fork first, and then the right fork. one of them tries acquiring the right fork first. note that the order in which forks are released does not matter +- tip - do not insert sleeps - we will see a deadlock quickly + +### Table + +```java +static class Table { + + private final Semaphore[] forks; + + Table(int size) { + + forks = new Semaphore[size]; + + for (int i = 0; i < size; i++) { + forks[i] = new Semaphore(1); + } + } + + @SneakyThrows + private void acquire(int philosopherId) { + forks[philosopherId].acquire(); + forks[(philosopherId + 1) % forks.length].acquire(); + } + + private void release(int philosopherId) { + forks[philosopherId].release(); + forks[(philosopherId + 1) % forks.length].release(); + } +} +``` + +### Philosopher + +```java +static class Philosopher { + + private final Table table; + + private final Integer id; + + public Philosopher(Table table, Integer id) { + this.table = table; + this.id = id; + } + + void start() { + while (true) { + contemplate(); + eat(); + } + } + + @SneakyThrows + private void contemplate() { + System.out.printf("%d thinking...\n", id); + } + + @SneakyThrows + private void eat() { + table.acquire(id); + System.out.printf("%d eating...\n", id); + table.release(id); + } +} +``` + +### Main Method + +```java +int size = 5; + +Table table = new Table(size); + +List threads = new ArrayList<>(); + +for (int i = 0; i < size; i++) { + Philosopher philosopher = new Philosopher(table, i); + Thread thread = new Thread(philosopher::start); + threads.add(thread); +} + +for (Thread thread : threads) { + thread.start(); +} + +for (Thread thread : threads) { + thread.join(); +} +``` + +### Solution 1 + +```java +// inside constructor +eatingPhilosophers = new Semaphore(size - 1); + +@SneakyThrows +private void acquire(int philosopherId) { + eatingPhilosophers.acquire(); + forks[philosopherId].acquire(); + forks[(philosopherId + 1) % forks.length].acquire(); + 
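+    // note - releasing the limiting semaphore here, right after both forks are acquired (instead of after eating),
+    // still prevents deadlock - at most size - 1 philosophers can be trying to pick up forks at any moment,
+    // so a full circular wait can never form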
eatingPhilosophers.release(); +} +``` + +### Solution 2 + +```java +@SneakyThrows +private void acquire(int philosopherId) { + if (philosopherId == 0) { + forks[(philosopherId + 1) % forks.length].acquire(); + forks[philosopherId].acquire(); + } else { + forks[philosopherId].acquire(); + forks[(philosopherId + 1) % forks.length].acquire(); + } +} +``` diff --git a/_posts/2024-06-29-general.md b/_posts/2024-06-29-general.md new file mode 100644 index 0000000..23eef27 --- /dev/null +++ b/_posts/2024-06-29-general.md @@ -0,0 +1,84 @@ +--- +title: General +--- + +## Resistance Training + +- "resistance training" - helps build muscles +- "weight training" = using free weights and machines +- "body weight training" = using body weight +- body weight training advantages - no special equipment is needed. also, less prone to injuries +- body weight training disadvantages - progressive overload is hard to achieve +- weight training advantages - helps build muscle quicker +- "progressive overload" - creating an environment which forces the body to adapt +- for progressive overload, increasing weights is very very important. this is the best way to build muscles +- formula to build muscles = training + nutrition + rest +- "hypertrophy" - process of building muscle - we damage the muscle fibres, and our body grows back stronger and more muscle fibres +- "compound movements" - work several muscles at once. they are the most effective +- for now, compound movements are much better for me than "isolation movements" i.e. movements that target a specific muscle + +## Workout Strategy + +- workouts are split into two categories +- they can be done for 4 days in a week, alternating between each other +- how much break to take - + - 30 seconds between warmup sets + - 1.5 min between sets + - 3 mins between exercises +- remember - we increase reps for "endurance" and increase weights for "strength" +- increasing weight is the right technique for hypertrophy +- "warmup" - lowers risk of injury and activates muscles before exercise +- we do 2 warmup sets per exercise - empty barbell, and then 0.5 times of first actual set + +## Full Body Workout 1 Routine + +- bench press - 2 warmup sets, 3 actual sets, 8 reps +- barbell row - 2 warmup sets, 3 actual sets, 8 reps +- barbell squats - 2 warmup sets, 3 actual sets, 8 reps +- bicep curls - 2 warmup sets, 3 actual sets, 10 reps + +## Full Body Workout 2 Routine + +- pull ups - 2 warmup sets, 3 actual sets, 8 reps +- deadlift - 2 warmup sets, 3 actual sets, 8 reps +- military press - 2 warmup sets, 3 actual sets, 8 reps +- tricep push down - 2 warmup sets, 3 actual sets, 10 reps + +## Flexibility + +- "flexibility" - moving a joint through full range of its motion +- static stretching != warmup - warmup is done using light weights +- mistake 1 - a stretch should be never be done beyond 60 seconds +- mistake 2 - do not hold your breath when stretching +- "static stretching" - bringing muscles to their peak position and holding them there for 15-30 seconds. should be done post-workout +- "dynamic stretching" - moving through a range of motion repeatedly. 
should be done pre-workout + +## Dynamic Stretching Routine - Lower Body + +- jumping jacks - 30 seconds +- walking lunges - 10 reps per leg +- knee tucks - 10 reps per leg +- leg swings - 10 reps per leg +- side lunges with a twist - 10 reps per leg + +## Dynamic Stretching Routine - Upper Body + +- normal arm circles - 10 reps +- normal arm circles (reverse direction) - 10 reps +- large arm circles - 10 reps +- large arm circles (reverse direction) - 10 reps +- pushups - for now, as many as possible in one set + +## Static Stretching + +- seated back twist +- quad stretch +- hip abductor stretch +- hamstring stretch +- gluteal stretch +- lat stretch +- calf stretch +- lying abdominal stretch +- neck side bend +- shoulder stretch +- tricep stretch diff --git a/_posts/2024-07-05-data-engineering.md b/_posts/2024-07-05-data-engineering.md new file mode 100644 index 0000000..d0d3df0 --- /dev/null +++ b/_posts/2024-07-05-data-engineering.md @@ -0,0 +1,403 @@ +--- +title: Data Engineering +--- + +## Different Architectures + +- oltp - + - online transactional processing + - used for operational data keeping + - we do not maintain a history of the data - we update records in place to reflect the current state of the system +- olap - + - online analytical processing + - used for analytical decision making + - it contains historical data as well, not just the current data + - millions of records are analyzed at a time, so we need fast query processing +- data warehouse - + - "centralized location" for all external otlp data sources - we basically copy the data from these different sources into a single place + - two reasons for using data warehouses - + - "optimized" for analytic processing + - should also be "user friendly" for decision makers + - must load data consistently and repeatedly using "etl" + - note - ensuring "non volatility" is important during etl - our data warehouse should be consistent between the etl refreshes that we do on a scheduled basis - warehouse can go into an inconsistent state "during" an etl refresh, ensure that does not happen +- data lake - + - was needed to accommodate for three vs - volume, velocity, variety + - because of this, technology used by data lake is big data, while warehouses use databases + - used when the use case is not properly defined yet. data can be taken out of lake + - used for storing raw data. 
data is not processed unlike in warehouse +- data virtualization - + - in data warehousing, we duplicate the data and transform it to optimize it + - in data virtualization, we access it from its original source itself + - we access data from different sources "in the same way", agnostic of the underlying database + - so, this technique is useful when we are okay with the larger response times, require minimum transformations on the raw data, etc +- etl - + - extract, transform, load + - "extract" - bring the raw data as is into the staging layer + - "transform" - perform transformations to create a dimensional model of the data + - "load" - load into the core / user access / warehouse layer + - it is typically done in batches +- ods - + - operational data storage + - used for "operational decision making" and not analytical decision making + - there is a need for "realtime" unlike in data warehouses, where we do batch processing + - the architecture we are talking about here is that from the actual sources, we have two separate things spawning off - the ods and the the actual warehouse + - another architecture - we can treat ods as the staging layer of the warehouse, and then perform batch "tl" on this staging layer for constructing the warehouse layer +- elt - + - problem with etl - a lot of data modelling and understanding of business use case is needed before storing the actual data + - instead, we flip the order of transform and load + - we "blast" the data into a "big data environment" in their raw form + - then, we use the compute of the warehouse / big data to perform transformations + - elt allows for much more flexible transformations as compared to etl, since in etl, we perform transformations to load data into the core layer, while the transformations move towards the client in elt + - with traditional data warehouses, we still use etl, but when we use the newer paradigms like data lakes etc, we start utilizing elt patterns +- we can build on prem or cloud, both have their own considerations. e.g. egress data from cloud is usually much more expensive than ingress data into cloud + +## Technologies in Warehouses + +- relational databases - uses popular sql for querying, primary keys and foreign keys, etc - relational databases are also a great starting point for storing our warehouses +- cubes - + - data is stored in a multidimensional array + - e.g. imagine we want sales for a particular customer and product, the value will already be aggregated across remaining dimensions like time + - it uses the mdx language for querying + - so, optimizations like pre computations, indexing, etc underneath make cubes very fast for complex queries + - extra "cost of compute" to maintain these complex structures + - so, imagine the recomputation in aggregations needed in case of updates + - earlier, cubes were an alternative to dimensional modelling and building warehouses using technologies like rdbms + - now, cubes can be seen as a specialized technology for the data mart layer +- in memory databases - for high query performance, used in for e.g. data marts. in traditional databases, data is stored in hdd / ssd in disk and loaded into memory for querying. this is eliminated here to increase performance. challenge - lack of durability, resolve via snapshots / images to help restore to a specific point. e.g. sap hana, amazon memory db, etc +- columnar storage - traditionally, queries are processed row wise. if we use columnar storage, data is stored column wise. 
so, if we need to process only a small subset of columns, we do not have to process entire rows and go through all the data like in traditional relational databases +- massive parallel processing - a task can be broken down into multiple subtasks. this way, these subtasks can run in parallel, thus optimizing performance +- in mpp above, we talked about breaking down compute. for scaling storage, the underlying architecture can be - + - shared disk architecture - underlying storage is one i.e. multiple computes run on top of the same underlying storage + - shared nothing architecture - underlying storage is also broken down i.e. each compute will have its own storage + +## Warehouse Architecture + +- we put the data "as is" into the staging layer of the warehouse +- we can do some minute transformations like renaming and adjusting positions of columns when performing a union between employee datasets from different sources +- we put the data into the core / user access / warehouse layer by performing transformations from the staging layer +- why do we need a staging layer i.e. why not load into the warehouse layers directly from the sources - + - we risk burdening our oltp systems + - also, data from sources can be in different formats like crm, files like xml, etc. we get all the data from these different sources into for e.g. a relational db, which helps us use sql / a standard way to perform transformations, e.g. perform joins easily which would not be possible otherwise when data comes from different sources +- staging layer can be temporary (more common) or permanent +- temporary - after the data is loaded into the warehouse layer, we truncate the staging layer. this way, the next time we want to load data into the warehouse layer, we need to perform a diff - we need to know the last inserted row inside the warehouse layer, and calculate the new rows inside the sources since then, and accordingly add the new data to the warehouse. this diff checking can be done on columns like timestamp if maintained +- persistent staging layer - maybe easier since existing warehouse layer can be recreated / new warehouse layers can be created easily +- we can either consume from the core layer directly, or have data marts on top. some use cases of having data marts - + - core layer has a lot of tables - we can have data marts on top of this. now, users will only have access to tables which they need + - core layer isn't burdened with queries from all consumers, and consumers only consume capacity of their data marts + - allow us to have different types of databases, e.g. in memory vs cubes based on use case + - other concerns like security are separated as well +- on a higher level, there are two options - "centralized" and "component based" +- "centralized" - one warehouse layer, and consumers consume from this layer. advantage - "one stop shopping" for all our consumers. here, we have two options - + - "edw" (enterprise data warehouses) - typical relational databases, columnar storages, etc + - "data lake" - big data technologies like hadoop, s3, etc +- "component based" - each consumer has its own data mart. advantage - allows for mix and match of technology based on use case. here, we have two options - + - "dependent data marts" - this is the variation we have seen - data marts are built on top of the warehouse layer + - "independent data marts" - in this, we skip the warehouse layer. we make data marts directly consume from the sources and perform transformations. 
so, it is like we have small warehouse layers per consumer +- general rule - we should strive towards "centralized" instead of "component based" +- initial load - + - first extraction + - it is much slower and puts load on sources, so it is best if done during weekends etc. so that the systems are not impacted + - we can use it for fault tolerance later on as well, in case the warehouse ever gets corrupted + - note - we are talking about a warehouse. this does not include only data of "current state", but "historical data" as well. recall that typically, only the current state of the system stored in the otlp databases, so we might need to support the audit trail of data somehow +- delta / incremental load - + - incrementally loading (new / modified / deleted) data + - uses fields like timestamp (created / modified date) etc + - note - for processing deletes, we use delete markers instead of actually deleting the data + - we run it on a schedule regularly instead of just once + - we also run it on a subset of data by filtering out and retaining only the new / modified data +- incremental load patterns - + - "append" - append new information + - "in place updates" - we make changes to existing data when applying updates + - "complete replacement" - we overwrite the entire data completely, even if only a small subset of it changes + - "rolling append" - say we only maintain a window of lets say 2 years of data. so, when we receive a sales event, we expire all the data that is more than 2 years before it +- typically, we use append for fact tables, and in place updates for scd 1 / append for scd 2. so, the incremental load patterns' decision is scoped to a "table", not an entire dimensional model + + ![warehouse architecture](/assets/img/warehouse-and-snowflake/data-warehouse.png) + +- transformation examples - + - deduplication - e.g. two campuses somehow have duplicate information about the same faculty, because this faculty takes classes in both campuses. we might want to deduplicate this faculty information when creating our dimensional models + - filtering out rows / columns + - data value unification - e.g. when performing a union, one source system contains m and f, while another contains male / female + - key generation - surrogate key + - joining + - splitting - e.g. if natural keys are composed of the division and department codes, split them into individual columns + - aggregations / groupings + - deriving columns - precalculate profit using sales, costs, discounts + +## Dimensional Modelling + +- "dimensional modelling" - it is a method of organizing data, which helps with two major concerns + - "usability" - business users should be able to understand it easily - several instances have shown how business users immediately understand this + - "performance" - querying it should be very fast. this method of organizing data helps with optimizations underneath, e.g. our queries would typically first filter out the rows in the dimension tables, and then perform joins with the fact table +- note - dimensions modelling is organizing tables into facts and dimensions. but, based on use case, we can also rather organize data as flat tables in our warehouse, e.g. join and group data to produce the final aggregated form directly, instead of expecting our end users to perform aggregations +- facts - represent a single measurement, e.g. an order line +- dimensions - "give context" to the measurements e.g. 
product category +- important - dimensions help in "grouping" / "filtering" facts for our use case. grouping and filtering is key here - we need to be able to derive this ourselves +- e.g. if our fact is student loan, and we want to derive insights "for" engineering major "by" department, we need to use where clause for major and group by department +- identifying facts - + - are measurable - we can perform some calculations on them + - facts can also mark events sometimes, which is why it is accompanied by a date +- grain - the most atomic level of a fact table - what does a row in the fact table actually represent. keep it fine level for flexibility, this way we can do aggregations for summaries +- we usually have multiple dimensions clustered around the fact, thus it is called a star schema - the structure looks like that of a star, with the fact table at the center and the dimension tables at the tips of the star spiking out of the fact table +- dimension tables have more attributes / are much wider when compared to fact tables +- however, fact tables have much more records than dimension tables +- maybe thats why we use delta load for fact tables and full load in dimension tables sometimes +- indexes - make reads faster, writes slower. useful when we join, filter by, etc. on the data based on that column +- b tree index - this is the default / most common type of index. we can break data into a multi level tree. helpful when column has high cardinality
+ ![b tree](/assets/img/warehouse-and-snowflake/b-tree.drawio.png) +- bitmap index - useful when the column has a low cardinality. the position of bit corresponds to the row number, and if its 1, it means that row has that value + + | pk | payment type | + | -- | ------------ | + | 1 | visa | + | 2 | mastercard | + | 3 | mastercard | + | 4 | visa | + + | payment type | bitmap | + | ------------ | ------ | + | visa | 1001 | + | mastercard | 0110 | + +- fact table indexing example - using b tree index on surrogate key and bitmap index on the dimension column foreign keys +- tips for fact tables - + - avoid bloating the fact table for increased performance. even if it is one to one, move the contextual data into a separate dimension table + - my understanding - i think the above point is even more important, considering facts are never updated, only dimensions are updated using [slowly changing dimensions](#slowly-changing-dimensions). so, we cannot store context in fact tables, since it cannot be updated +- tips for dimension tables - + - try replacing cryptic abbreviations with real words to decrease reliance on lookups etc + - sometimes, ids have a special embedded meaning, e.g. first three letters represent the department. extract these into separate columns beforehand instead of having business users query on these attributes + - denormalization or data redundancy might be used in dimension tables to help with query performance, e.g. the column category in the product dimension table discussed above, notice the repeating category snacks. data warehouses are not maintained using 3nf models, if we do so, we are creating a snowflake schema + - index dimension tables properly - since operations like filtering etc are performed on dimension tables, we should index them properly +- snowflake schema - + - a star schema is a snowflake schema with one level + - e.g. the product dimension table will be a join between the product and category tables, thus making it denormalized + - if the denormalization above is actually degrading our performance / affecting our consistency, we can instead have multiple levels e.g. the category column can be extracted into its own dimension table in the product dimension table + - so, when we have multiple levels of dimension tables like this, it is also called a snowflake schema + - note it can result in multiple joins, thus degrading performance, but consume lesser storage at the same time +- surrogate keys - + - use auto generated integer ids / a key management service instead of natural keys + - they are more performant compared to natural keys which are usually strings + - we can also use -1 for dummy dimensions discussed below to make our intent clearer + - we can still retain the natural keys, but the warehouse and its users use the surrogate keys to perform joins etc +- exception to above - date dimensions - foreign key / surrogate key in date dimension does not have to be meaningless auto incremented integers. instead of an auto incremented integer, represent the actual date time as an integer, e.g. 
050820232232 (first 8 characters represent date, remaining 4 time) +- so, the final architecture (in terms of keys) looks like this - + - dimension tables have surrogate keys as their primary key + - however, the dimension tables can still retain the natural key columns + - the fact tables use the composite key comprising of all the foreign surrogate keys as their primary key + - however, the fact tables can still retain the natural key columns +- it is common to keep pre calculated aggregates in the fact table, e.g. instead of expecting users to perform addition / subtraction to get the profit earned per order line item, just maintain it as a separate column in the fact table which can be used. this way users do not perform erroneous calculations +- date dimension - pre calculated values like day of week, month name, etc. since date dimension is very predictable, we can pre-populate the date dimension table for the next 10 years or so in advance. we can also consider populating all variations, e.g. month full name, month abbreviation, month as integer in case of date dimension + +## Additivity in Facts + +- note - additivity includes adding only, not average etc +- "additive facts" - can be added across all dimensions. e.g. adding units sold across the date dimension tells us the number of units sold for a particular product, adding the units sold across the product dimension tells us the number of units sold at a particular date. note - i get confused in terminology. across product means group by date + + | product_id | units sold | date | price | + | ---------- | ---------- | -------- | ----- | + | 1 | 2 | 06082023 | 23 | + | 2 | 2 | 06082023 | 19 | + | 1 | 5 | 10082023 | 11 | + +- "semi-additive facts" - only added across some dimensions. e.g. imagine a fact table where the grain is our balance on a particular date for a particular folio + + | portfolio_id | date | balance | + | ------------ | -------- | ------- | + | 1 | 06082023 | 100 | + | 2 | 06082023 | 50 | + | 1 | 10082023 | 110 | + +- adding balance across the date dimension does not make sense, since balance is a cumulative number + + | portfolio_id | balance | + | ------------ | ------- | + | 1 | 210 | + | 2 | 50 | + +- but adding it across portfolios tells us our total balance on a particular date + + | date | balance | + | -------- | ------- | + | 06082023 | 150 | + | 10082023 | 110 | + +- "non-additive facts" - cannot be added across any dimension. e.g. price of a product. if products make up our fact table, there is no meaning of summing the prices of different products +- nulls for facts in fact tables - usually in tools, average ignores null, sums will treat nulls as a 0, etc. but sometimes, we might want to replace nulls with 0, it depends on our use case +- nulls for foreign keys in fact tables - nulls can result in problems, therefore introduce a row in the dimension table with a dummy value, and have the rows with null as foreign key in the fact table point to this dummy value's primary key instead + +## Types of Fact Tables + +### Transactional Fact Table + +- a grain indicates one event / transaction +- e.g. 
one row represents one meal payment - + - dimensions are date, the counter and the student + - the fact is the actual amount +- the fact table will look as follows (recall how primary key of a fact table is a composite key constructed using the surrogate foreign keys for dimension tables) - + - student_id (pk, fk) + - counter_id (pk, fk) + - date_id (pk, fk) + - amount (double) +- characteristic - will have "many dimensions" (foreign keys) +- disadvantage - "grow very rapidly" in size, and queries on such facts often require aggregations +- we can store two or more facts in this fact table, if the "two rules" below are met - + - facts occur at the same grain + - facts occur simultaneously +- e.g. tuition bill and tuition payment - + - they occur at the same grain - both have similar dimensions - student, date, etc + - but, they occur at different times, since they are different business processes + - so, they cannot be stored in the same transactional fact table +- e.g. tuition bill, hostel bill and co curricular activities bill - + - they can be stored in the same fact table as they satisfy both rules + - so, the fact table will have 3 different facts for the 3 amounts +- assume tuition and co curricular activities had a campus component in them, which the hostel did not + - facts do not occur at the same grain - hostel does not have the same dimensions as the other two + - so, we cannot store the three facts in the same fact table + +### Periodic Snapshot Fact Table + +- a grain is a "summary of transactions" +- sometimes, it is possible to do this using the transaction fact table, but for e.g. the sql can get complex +- so, we can instead answer specific business questions using periodic snapshot fact tables +- e.g. analyze end of week balances for a customer. fact table - + - student_id (pk, fk) + - week_id (pk, fk) + - balance +- characteristic - because of its nature, it "will not have many dimensions", since a row is an aggregation across some dimension(s) +- advantage - they "grow slower" compared to the transactional fact table +- typically, they are "semi additive" - because the fact here is a summarized value, we cannot add it across all dimensions as easily as a transactional fact + +### Accumulation Snapshot Fact Table + +- one grain summarizes the "progress of a business process" defined through different stages +- e.g. one row represents an order, which has stages for production, packaging, shipping, delivery, etc +- characteristic - it "has many date dimensions" +- so, date is also an example of "a role playing dimension" for us +- this too should grow slower in size as compared to the transactional fact table + +### Factless Fact Table + +- sometimes, there is no measurable fact +- e.g. a fact table where a new record for every new employee that is registered +- it will have dimensions like department id, employee id, position id, date, etc, but no measurable fact +- we can perform aggregations to answer questions like - + - number of employees who joined last month + - number of employees in a department +- another technique - "tracking fact" + - we store a boolean value - can be set to 1 or 0 + - e.g. the fact table represents a student "registering" for a webinar + - but, we store 1 or 0 depending on whether or not the student actually attends the webinar + - now, we can perform sum aggregation to get the number of students who actually attend the webinar + +## Types of Dimensions + +- "conformed dimensions" - dimensions shared across multiple facts, e.g. 
date dimension. advantages - + - helps combine the facts using the shared dimension. e.g. if we have two different facts for sales and profits, we can combine them using the date dimension. this helps us compare the cost and sales side by side + - we can reuse the same dimension table across different dimensional models - this way, we save on the etl / storage costs of maintaining duplicate dimension tables +- "degenerate dimension" - e.g. we have a sales fact table, with a foreign (surrogate) key for the category dimension. the category dimension only has two columns - the surrogate key and the category name. so, we can instead directly store the category name in the sales fact table. this usually occurs in the transactional fact table +- "junk dimensions" - e.g. imagine we have a lot of indicators that are eating up a lot of width (therefore space) of the fact table, thus impacting its performance. so, we can instead extract these dimensions to its own table. note - the "cardinality" of these dimensions should be "low" +- typically, we store all the possible values in the junk dimension. but, the number of rows in this junk dimension can grow a lot, e.g. m, n, p, q values for 4 columns respectively will mean a total of m * n * p * q combinations. so, we can - + - store the dimensions "as we come across them" in this junk dimension instead of storing all combinations in a precomputed fashion + - "split" the junk dimensions i.e. group related junk dimensions together. advantage - now, one junk dimension table will have m * n values, another will have p * q values + + | amount | payment_method | incoming / outgoing | + | ------ | -------------- | ------------------- | + | 23 | credit card | incoming | + | 12 | cash | outgoing | + + | amount | flag | + | ------ | ---- | + | 23 | 1 | + | 12 | 3 | + + | pk | payment_method | incoming / outgoing | + | -- | -------------- | ------------------- | + | 1 | credit card | incoming | + | 3 | cash | outgoing | + +- "role playing dimension" - + - same dimension table is referenced in the fact table multiple times + - e.g. date dimension for order date vs shipping date. note - this is also an example of [accumulation snapshot fact table](#accumulation-snapshot-fact-table) + - an additional optimization for bi - use views - data is not duplicated, but users see different dimension tables for the different "roles" the date dimension might be playing, thus increasing readability + +## Slowly Changing Dimensions + +- we have different techniques to manage changes in dimensions +- my realization - data in fact tables is never updated, it has insert only operations. this means it makes even more sense to move out data that is not "measurement related" from the fact table + +### Type 0 + +- only retain the original data +- useful when our dimensions do not change. e.g. date dimension + +### Type 1 + +- we overwrite the old data in the same row with the new values for its columns +- so, no history is retained +- e.g. a new category for biscuits was introduced. so, the category changes from snacks to biscuits +- issue - this can suddenly kind of maybe show reduced sales for dashboards which were monitoring the category for snacks + +### Type 2 + +- add new rows for the new versions, instead of updating existing rows as in [type 1](#type-1) +- so basically, now we have a new row with a new primary surrogate key in our dimension table +- disadvantage - most complex to implement +- advantage - most accurate representation of history +- e.g. 
old financial aids will point to the older student dimension row, and newer financial aid will point to the newer student dimension row + + | financial_aid_key | student_key | amount | + |-------------------|-------------|--------| + | 1 | 2 | 500 | + | 1 | 3 | 200 | + | 1 | 4 | 750 | + + | key | name | level | + |-----|---------------|------------------| + | 2 | john doe | higher secondary | + | 3 | michael smith | higher secondary | + | 4 | john doe | college | + +- issue 1 - how can we get all financial aids for a student? my understanding - we will have to use the natural key stored in the dimension table, student_id in this case. e.g. we can fetch all the keys corresponding to the student_id for which we need the financial aids, then we can join it with the fact table for all the financial aids for this student + + | key | name | level | student_id | + |-----|---------------|------------------|------------| + | 2 | john doe | higher secondary | 87555 | + | 3 | michael smith | higher secondary | 54568 | + | 4 | john doe | college | 87555 | + +- issue 2 - we do not have a way of telling for e.g. the current categories in our system, since we now have multiple rows + - option 1 - we can have a boolean attribute is obsolete, which will be marked 1 for all the newer versions, and 0 for all the older versions. issue - we cannot predict the "order of updates", in case there are 4-5 updates for the same student + - option 2 - introduce an effective and an expiry date. it helps maintain the ordering as well + - option 3 - use both. this is the recommended approach by kimball + + | key | name | level | student_id | effective_date | expiry_date | is_active | + |-----|---------------|------------------|------------|----------------|-------------|-----------| + | 2 | john doe | higher secondary | 87555 | 06082023 | 08082023 | 0 | + | 3 | michael smith | higher secondary | 54568 | 06082023 | 31129999 | 1 | + | 4 | john doe | college | 87555 | 08082023 | 31129999 | 1 | + +### Type 3 + +- we introduce a new columns for the different versions, as opposed to inserting new rows as in [type 2](#type-2) +- advantage - allows to switch back and forth between different versions easily +- so we will have two columns - previous category and current category in the category dimension table +- it is not for unpredictable / frequent changes, we use type 2 for that +- this is for a more structured change like a reorganization at a company +- this way, we can lay out a clear demise plan in place with maintaining backwards compatibility + +## ETL Design + +- our etl design is guided by our decisions of - + - type of [slowly changing dimensions](#slowly-changing-dimensions) we use + - how we do cdc (change data capture) i.e. recall the [kind of incremental loads](#warehouse-architecture) we can use + - [type of fact table](#types-of-fact-tables) in our dimensional model +- there are different ways to do cdc - + - we can use the timestamp of the transaction. we can use the "watermark" for this - we keep track of the last etl run's timestamp. based on the value of the watermark, we need to look for new / updated data in the sources + - we can also use the "database logs" directly - debezium i believe uses this +- we should also have some kind of "parallel processing" for more efficiency. however, we have to process dimension table changes before processing the fact table changes. 
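+- a minimal, self contained python sketch of the watermark based cdc described above - the row structure, column names and dates here are made up for illustration - 
+  ```py
+  from datetime import datetime, timezone
+
+  def incremental_extract(source_rows, last_watermark):
+      """return only the rows changed since the previous run, plus the new watermark"""
+      delta = [row for row in source_rows if row["last_modified"] > last_watermark]
+      new_watermark = max((row["last_modified"] for row in delta), default=last_watermark)
+      return delta, new_watermark
+
+  source_rows = [
+      {"id": 1, "amount": 100, "last_modified": datetime(2024, 7, 1, tzinfo=timezone.utc)},
+      {"id": 2, "amount": 250, "last_modified": datetime(2024, 7, 3, tzinfo=timezone.utc)},
+  ]
+
+  # the watermark persisted by the previous etl run - only rows modified after it are picked up
+  delta, watermark = incremental_extract(source_rows, datetime(2024, 7, 2, tzinfo=timezone.utc))
+
+  print(delta)      # only the row with id 2
+  print(watermark)  # 2024-07-03 00:00:00+00:00 - stored as the lower bound for the next run
+  ```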
even the dimension table changes need to be processed in order of dependency if we have a snowflake schema instead of a star schema
+- for dimension tables - 
+  - new data - we can look for natural keys in the source that are not present in our dimension table. then, we can insert them after additionally generating a surrogate key for them
+  - type 1 change example - "update all rows". why all rows - if a type 2 change for a different column occurs before the type 1 change for this column, we will have to update all the rows with the same natural key with the new value. why - when analyzing, it might happen that the older version of the dimension is relevant - we would still want the latest value for this column in that older version
+  - type 2 change example - we add a new row with a new surrogate key. the natural key stays the same however. we can update the effective / expiry date columns accordingly as well
+- for fact tables - 
+  - we need to get "the right row" from the dimension table, because there can be multiple rows for the same natural key due to type 2 scd. after this, we can add the new fact, if present, using the surrogate key of this latest dimension row
diff --git a/_posts/2024-07-16-python-basics.md b/_posts/2024-07-16-python-basics.md
new file mode 100644
index 0000000..72f7678
--- /dev/null
+++ b/_posts/2024-07-16-python-basics.md
@@ -0,0 +1,2052 @@
+---
+title: Python Basics
+---
+
+## Getting Started
+
+- python3 vs python2 - python3 is **not** backwards compatible - a lot of big changes were made
+- it addresses a lot of issues with python2, and the world has moved to python3 now
+- checking the version - `python3 --version`
+- entering the interactive shell in python - type `python3` and hit enter
+- this is also called a **repl** - read, evaluate, print loop
+- this is why, when we enter 43 + 1, it will read, evaluate and finally print the result. then, the prompt comes up again
+- exiting the interactive shell - ctrl + D
+- run a python file using `python3 file_name.py`
+
+## Numbers & Operators
+
+- three data types for numbers in python - ints, floats and complex numbers - 
+  ```py
+  print(type(1)) # <class 'int'>
+  print(type(1.1)) # <class 'float'>
+  ```
+- type coercion - an operation between an int and a float returns a float
+- division returns a float, unlike other programming languages
+  ```py
+  print(1 / 2) # 0.5
+  ```
+- for exponents (or roots, by raising to fractions) - 
+  ```py
+  print(2 ** 3) # 8
+  print(8 ** (1 / 3)) # 2.0
+  ```
+- modulo - gives the remainder
+  ```py
+  print(8 % 3) # 2
+  ```
+- integer division - 
+  ```py
+  print(10 // 3) # 3
+  ```
+
+## Variables and Data Types
+
+- **variables** - store some data and pull it out later
+- some reasons why we need variables
+  - data can be dynamic and need not be static - e.g. when we pull the data from a database
+  - even for static data, it makes code more readable - e.g. 
imagine using a variable pi vs the actual number itself in code +- points about assignment of variables - + - variables can be reassigned + - variables can be assigned to other variables +- variable naming restrictions - + - start with letter or underscore + - rest of it can be letters, numbers, underscores +- variable naming conventions - + - snake case is preferred + - lowercase is preferred (use uppercase for constants) + - typically, variables starting or ending with two underscores (called dunder) indicate that it is used for something internal and should not be fiddled with +- some **data types** - boolean, [number related data types](#numbers--operators), strings, lists, dictionary, etc +- **dynamic typing** - variables can change data types. most other languages are **statically typed** + ```py + variable = True + print(type(variable)) # + + variable = "shameek" + print(type(variable)) # + ``` +- **None** - used to represent nothingness - equivalent of null in other languages + ```py + nothing = None + print(type(nothing)) # + ``` +- strings can be represented using single or double quotes +- strings in python are **unicode** (not just ascii) +- **escape sequences** like `\n` etc are supported as well +- string concatenation - + ```py + print("hello, " + "shameek") # hello, shameek + ``` +- **formatted strings** - used to **interpolate** variables + ```py + username = "shameek" + password = "keemahs" + print(f'logging in user {username} using password {password}') + # logging in user shameek using password keemahs + ``` +- **string indexing** - remember that python supports negative indices as well + ```py + name = "hello" + print(name[0]) # h + print(name[-1]) # o + print(name[5]) # IndexError: string index out of range + ``` +- **type conversion** - we already saw this in coercion, string interpolation, etc. but we can do this **explicitly** as well - + ```py + variable = 50.456 + print(int(variable)) # 50 + + variable = [1, 2, 3, 4] + print(str(variable)) # [1, 2, 3, 4] + ``` +- example of taking an input from a user. notice how we use type conversion to convert the string input to a float + ```py + kms = float(input("enter kms: ")) + + miles = kms / 1.6 + miles_formatted = round(miles, 2) + + print(f"{kms} kms = {miles_formatted} miles") + ``` + +## Boolean and Conditional Logic + +- **conditional statements** - take different paths based on comparison of input + ```py + if name == "arya stark": + print("all men must die") + elif name == "jon snow": + print("you know nothing") + else: + print("carry on") + ``` +- apart from **comparison operators** that resolve to truthy or falsy, the following examples resolve to falsy as well - + - empty (strings, lists, objects, etc) + - None + - zero + + ```py + if 0: + print("falsy") + else: + print("truthy") # this gets printed + ``` + +- **comparison operators** - `==`, `!=`, `>`, `<`, `>=`, `<=` +- **logical operators** - combine booleans. 
`and`, `or`, `not` +- `is` vs `==` - feels like the same as **comparing by reference** vs **comparing by value** + ```py + print(1 == 1) # True + print(1 == 1) # True + + print([1, 2, 3] == [1, 2, 3]) # True + print([1, 2, 3] is [1, 2, 3]) # False + + my_list = [1, 2, 3] + my_list_clone = my_list + print(my_list_clone is my_list) # True + ``` + +## Looping + +- **for loops** - loop over a collection of data like every item of a list, every character of a string, etc +- [**iterable object**](#iterators-and-generators) - the collection of data we loop over + ```py + for letter in "coffee": + print(letter) + # c o f f e e + ``` +- **range** - quickly generate numbers in a certain range - + - range(7) - 0 to 6 + - range(1, 8) - 1 to 7 + - range(1, 10, 2) - 1 3 5 7 9 + + ```py + for i in range(1, 5): + print(i) + # 1 2 3 4 + ``` +- **while loop** - continue executing while the conditional statement is truthy + ```py + password = input("enter password: ") + + while password != "bananas": + password = input("wrong, please re-enter password: ") + + print("authenticated successfully!") + ``` +- we also have **break** in python if we need it + +## Lists + +- **lists** - **ordered** collection of items +- it is a **data structure** - a combination of [**data types**](#variables-and-data-types) +- we can add / remove items, reorder items, etc in a list +- a list can contain different data types +- e.g. of using len - + ```py + demo_list = [1, True, 4.5, "bca"] + + print(len(demo_list)) # 4 + ``` +- iterable objects like a range can be converted to a list as well - + ```py + rng = range(1, 4) + print(rng) # range(1, 4) + + lst = list(rng) + print(lst) # [1, 2, 3] + ``` +- **accessing data** - remember that negative indexing is supported in python as well. on exceeding the bounds, we get an index error + ```py + friends = ["Ashley", "Matt", "Michael"] + + print(friends[1]) # Matt + print(friends[3]) # IndexError: list index out of range + + print(friends[-1]) # Michael + print(friends[-4]) # IndexError: list index out of range + ``` +- use **in** too check if a value is present in a list + ```py + print("Ashley" in friends) # True + print("Violet" in friends) # False + ``` +- **iterating** over lists - + ```py + friends = ["Ashley", "Matt", "Michael"] + + for friend in friends: + print(friend) + ``` +- in case we need idx, maybe we can use the following - + ```py + for idx in range(0, len(friends)): + print(f"friends[{idx}] = {friends[idx]}") + ``` +- use **append** for adding a single element / **extend** for adding multiple elements + ```py + nums = [1, 2, 3] + + nums.append(4) + print(nums) # [1, 2, 3, 4] + + nums.extend([5, 6, 7]) + print(nums) # [1, 2, 3, 4, 5, 6, 7] + ``` +- use **insert** to add an element at a specific position + ```py + nums = [1, 2, 3] + + nums.insert(2, 4) + print(nums) # [1, 2, 4, 3] + ``` +- **clear** - delete all items from the list + ```py + nums = [1, 2, 3] + + nums.clear() + print(nums) # [] + ``` +- **pop** - remove the last element / remove element from the specified index + ```py + nums = [1, 2, 3, 4] + + removed_element = nums.pop() + print(f"removed = {removed_element}, nums = {nums}") # removed = 4, nums = [1, 2, 3] + + removed_element = nums.pop(1) + print(f"removed = {removed_element}, nums = {nums}") # removed = 2, nums = [1, 3] + ``` +- **remove** - specify the element to delete, and its first occurrence is removed + ```py + nums = [1, 2, 3, 2, 1] + nums.remove(1) + print(nums) # [2, 3, 2, 1] + ``` +- **index** - return the (first?) 
index where the specified value is present + - we can specify the range of indices - start and end between which it should look for + - throws an error if not present + + ```py + numbers = [1, 2, 4, 3, 5, 4, 5, 2, 1] + + print(numbers.index(4)) # 2 + print(numbers.index(4, 3, 6)) # 5 + print(numbers.index(21)) # ValueError: 21 is not in list + ``` +- **count** - number of times the element occurs in the list + ```py + numbers = [1, 2, 4, 3, 5, 4, 5, 2, 1] + + print(numbers.count(4)) # 2 + print(numbers.count(21)) # 0 + ``` +- **reverse** to reverse the list - in place +- **sort** - sort the elements, again in place + ```py + numbers = [2, 1, 4, 3] + numbers.sort() + print(numbers) # [1, 2, 3, 4] + ``` +- **join** - concatenate the elements of the string using the specified separator + ```py + words = ["hello", "to", "one", "and", "all", "present"] + sentence = ' '.join(words) + print(sentence) # hello to one and all present + ``` +- **slicing** (works on strings as well) - allows us to make copies. we provide three *optional* pieces of information - start, stop and step + ```py + numbers = [1, 2, 3, 4, 5, 6] + + print(numbers[:]) # [1, 2, 3, 4, 5, 6] + print(numbers[1:]) # [2, 3, 4, 5, 6] + print(numbers[:2]) # [1, 2] + print(numbers[1:5]) # [2, 3, 4, 5] + print(numbers[1:5:2]) # [2, 4] + ``` +- we can use negative steps to go backwards as well when slicing + ```py + nums = [1, 2, 3, 4] + print(nums[::-1]) # [4, 3, 2, 1] + ``` +- shorthand in python for swapping elements of a list - + ```py + numbers = [1, 2, 3] + + numbers[0], numbers[2] = numbers[2], numbers[0] + print(numbers) # [3, 2, 1] + ``` +- destructuring lists - + ```py + a, b, c = [1, 2, 3] + print(f"{a} {b} {c}") + ``` + +### Comprehensions + +- also applicable to tuples etc +- shorthand of doing it via for loop manually. basic syntax - + ```py + nums = [1, 2, 3] + nums_mul_10 = [x * 10 for x in nums] + print(nums_mul_10) # [10, 20, 30] + ``` +- list comprehension with conditionals - + ```py + nums = list(range(1, 10)) + odds = [num for num in nums if num % 2 != 0] + print(odds) # [1, 3, 5, 7, 9] + ``` +- the first condition below determines how to map the element. think of it like a ternary expression. the second condition acts like a filter, like the one we saw in the example above + ```py + nums = list(range(1, 10)) + mapped = ["3x" if num % 3 == 0 else str(num) for num in nums if num % 2 == 1] + print(mapped) # ['1', '3x', '5', '7', '3x'] + ``` +- list comprehension with strings - + ```py + all_characters = "the quick big brown fox jumps over the lazy dog" + vowels = [character for character in all_characters if character in "aeiou"] + print(vowels) # ['e', 'u', 'i', 'i', 'o', 'o', 'u', 'o', 'e', 'e', 'a', 'o'] + ``` +- nested list comprehensions - e.g. we would like to generate a combination of all suits and values for generating cards - + ```py + possible_suits = ("Hearts", "Diamonds", "Clubs", "Spades") + possible_values = ("A", "2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K") + + cards = [f"{value} of {suit}" for suit in possible_suits for value in possible_values] + + print(cards) + + # ['A of Hearts', + # '2 of Hearts', + # '3 of Hearts', + # ... + # 'J of Spades', + # 'Q of Spades', + # 'K of Spades'] + ``` +- note - because we did not surround the first list inside square braces, we got a flattened list automatically. 
for obtaining a list of lists, we could use the following instead - + ```py + cards = [[f"{value} of {suit}" for suit in possible_suits] for value in possible_values] + + # [['A of Hearts', 'A of Diamonds', 'A of Clubs', 'A of Spades'], + # ... + # ['K of Hearts', 'K of Diamonds', 'K of Clubs', 'K of Spades']] + ``` + +## Dictionaries + +- helps describing data with detail - e.g. item in a shopping cart has attributes like product, quantity +- it uses **key value pairs** - in lists, keys are the indices + ```py + cat = { + "name": "bubbles", + "age": 3.5, + "color": "blue" + } + + print(type(cat)) # + print(cat) # {'name': 'bubbles', 'age': 3.5, 'color': 'blue'} + ``` +- we can pass an iterable of iterables of length 2 to dict as well. it will create a dictionary for us automatically, by using the first element as the key and the second element as the value - + ```py + print(dict([("shameek", 25), ("colt", "45")])) # {'shameek': 25, 'colt': '45'} + ``` +- accessing data - similar to how we do it in lists. notice the `KeyError` if the key is not present + ```py + cat = {"name": "bubbles", "age": 3.5, "color": "blue"} + + print(cat["name"]) # bubbles + print(cat["not_present"]) # KeyError: 'not_present' + ``` +- accessing all elements of dictionary - + ```py + cat = {"name": "bubbles", "age": 3.5, "color": "blue"} + + print(cat.values()) # dict_values(['bubbles', 3.5, 'blue']) + print(cat.keys()) # dict_keys(['name', 'age', 'color']) + print(cat.items()) # dict_items([('name', 'bubbles'), ('age', 3.5), ('color', 'blue')]) + ``` +- now, we can use for loops for the iterables we saw above - + ```py + cat = {"name": "bubbles", "age": 3.5, "color": "blue"} + + for key, value in cat.items(): + print(f'{key} => {value}') + + # name => bubbles + # age => 3.5 + # color => blue + ``` +- check the presence of a key in the dictionary - + ```py + cat = {"name": "bubbles", "age": 3.5, "color": "blue"} + + print("name" in cat) # True + print("phone" in cat) # False + ``` +- check if a value is present in a dictionary - since values returns an iterable data structure, we can use in again, like we used in [**lists**](#lists) + ```py + cat = {"name": "bubbles", "age": 3.5, "color": "blue"} + + print("blue" in cat.values()) # True + print("purple" in cat.values()) # False + ``` +- **clear** - to clear a dictionary +- **copy** - to clone a dictionary. notice the difference in outputs between outputs of `is` vs `==`, discussed [here](#boolean-and-conditional-logic) + ```py + cat = {"name": "bubbles", "age": 3.5, "color": "blue"} + copy_cat = cat.copy() + + print(cat is copy_cat) # False + print(cat == copy_cat) # True + ``` +- **get** - return value if key is present, else return None + ```py + user = {"name": "shameek", "age": 25} + + print(user.get("name")) # shameek + print(user.get("phone")) # None + ``` +- now, get can also accept a default value - + ```py + print(user.get("phone", "+916290885679")) # +916290885679 + ``` +- **pop** - remove the key value pair from the dictionary for the key passed. 
it also returns the value removed + ```py + user = {"name": "shameek", "age": 25} + + print(user.pop("name")) # shameek + print(user) # {'age': 25} + print(user.pop("email")) # KeyError: 'email' + ``` +- we can add / update values like this - + ```py + user = {"name": "shameek"} + + user["age"] = 25 + user["name"] = "shameek agarwal" + print(user) # {'name': 'shameek agarwal', 'age': 25} + ``` +- **update** - modify value if the key is already present, else add the key value pair to the dictionary + ```py + user = {"first_name": "shameek", "age": 2} + user.update({"last_name": "agarwal", "age": 25}) + print(user) # {'first_name': 'shameek', 'age': 25, 'last_name': 'agarwal'} + ``` +- dictionary comprehension example - look how we obtain both key and value, use `.items` and use curly instead of square braces. rest of the things stay the same + ```py + numbers = {'one': 1, 'two': 2, 'three': 3} + powers = {f'{key}^{value}': value ** value for key, value in numbers.items()} + print(powers) # {'one^1': 1, 'two^2': 4, 'three^3': 27} + ``` +- map values in list 1 to values in another list - + ```py + list1 = ["CA", "NJ", "RI"] + list2 = ["California", "New Jersey", "Rhode Island"] + + answer = {list1[i]: list2[i] for i in range(0,3)} + ``` + +## Tuples + +- difference from list - it is **immutable** - we cannot simply insert / remove elements etc + ```py + numbers = (1, 2, 3, 4) + + print(type(numbers)) # + numbers[0] = 5 # TypeError: 'tuple' object does not support item assignment + ``` +- due to features like immutability, tuples are generally faster than lists +- note - tuples can also be used as keys in a dictionary, while lists cannot. so, tuples are useful if we want to have an ordered collection as a key in a dictionary + ```py + hotels_tuple = { + (23.4, 90.1): "taj", + (75.11, 69.2): "oberoi" + } + + print(hotels_tuple) # {(23.4, 90.1): 'taj', (75.11, 69.2): 'oberoi'} + + hotels_list = { + [23.4, 90.1]: "taj", + [75.11, 69.2]: "oberoi" + } + + print(hotels_list) # TypeError: unhashable type: 'list' + ``` +- accessing elements using square braces, using a for loop to iterate, using methods like count, index, len, slicing, etc work the same way like they do in [**lists**](#lists) +- inter conversions in python is easy - + ```py + my_list = [1, 2, 3] + print(tuple(my_list)) # (1, 2, 3) + + my_tuple = (1, 2, 3) + print(list(my_tuple)) # [1, 2, 3] + ``` +- note - if we want to make a tuple with one element only, python treats it as the parentheses used for explicit priority in mathematical expressions etc. so, put a comma at the end as well + ```py + tuple_incorrect = (1) + print(tuple_incorrect) # 1 + + tuple_correct = (1,) + print(tuple_correct) # (1,) + ``` + +## Sets + +- **no duplicates** +- **no ordering**, so we cannot access elements by index etc +- e.g. 
of creating a set - + ```py + uniques = {1, 1, 2, 1, 2, 3, 1, 2, 3, 3, 3, 2, 1, 1, 2} + print(uniques) + ``` +- methods like add, remove, etc work like in list whilst ensuring no duplicates +- discard vs remove - discard does not fail, but returns null instead + ```py + numbers = {1, 2, 3} + + print(numbers.discard(4)) # None + print(numbers.remove(4)) # KeyError: 4 + ``` +- set math - things like intersection, union, difference, etc + ```py + nums_a = {1, 2, 3, 4} + nums_b = {3, 4, 5, 6} + + print(nums_a.intersection(nums_b)) # {3, 4} + print(nums_a.union(nums_b)) # {1, 2, 3, 4, 5, 6} + print(nums_a.difference(nums_b)) # {1, 2} + ``` + +## Functions + +- reusable piece of logic, which we can call using different inputs to get different outputs + - helps keep code dry + - helps abstract away complexities +- example of a basic function - notice the default return value is None + ```py + def sing_happy_birthday(): + print("happy birthday dear you") + + + result = sing_happy_birthday() # happy birthday dear you + print(result) # None + ``` +- functions with parameters - + ```py + def sing_happy_birthday(name): + print(f"happy birthday dear {name}") + + + sing_happy_birthday("shameek") # happy birthday dear shameek + ``` +- **parameters** - variables used in the method definitions +- **arguments** - data we pass to the parameters +- **default parameters** - promotes defensive programming, can improve flexibility and readability, e.g. pop in lists pops from end if an index is not specified + ```py + def exponent(base, power=2): + return base ** power + + + print(exponent(5, 3)) # 125 + print(exponent(3)) # 9 + ``` +- **keyword arguments** - what we saw till now is called **positional arguments**. the following style of passing arguments is called **keyword arguments**, and it allows for even more flexibility - + ```py + def exponent(base, power=2): + return base ** power + + + print(exponent(power=5, base=2)) # 32 + ``` +- **scope** - variables created in a function are scoped to that function only + ```py + def speak(): + sound = "hello" + + + speak() + print(sound) # NameError: name 'sound' is not defined. Did you mean: 'round'? + ``` +- **global** - variables not defined inside a function are global. however, we get an error below - + ```py + total = 0 + + + def increment(): + total += 1 # UnboundLocalError: local variable 'total' referenced before assignment + + + increment() + print(total) + ``` +- we need to tell our function at the beginning that it actually refers to the global variable - + ```py + total = 0 + + + def increment(): + global total + total += 1 + + + increment() + print(total) # 1 + ``` +- similarly, for inner functions to access variables of outer functions, we use **non local** - + ```py + def outer(): + counter = 1 + + def inner(): + nonlocal counter + counter += 1 + + inner() + + return counter + + + print(outer()) + ``` +- python example to check if a string is a palindrome. 
note - "a man a plan a canal Panama" is a palindrome as well - since we ignore white spaces and want case insensitive + ```py + def is_palindrome(sentence): + characters = [x.lower() for x in sentence if x != ' '] + return characters == list(reversed(characters)) + # or return characters == characters[::-1] + ``` + +### Args, Kwargs and Unpacking + +- `*args` - allows us to pass **variable number** of **positional arguments** + ```py + def sum_except_first(num1, *args): + print(f"skipping {num1}") + return sum(args) + + + print(sum_except_first(1)) # 0 + print(sum_except_first(1, 2, 3, 4)) # 9 + ``` +- `**kwargs` - allows us to pass **variable number** of **keyword arguments** + ```py + def fav_colors(**kwargs): + print(kwargs) + + + fav_colors(shameek="red", colt="purple") # {'shameek': 'red', 'colt': 'purple'} + ``` +- e.g. use case - combine a word with its prefix and suffix if provided - + ```py + # Define combine_words below: + def combine_words(word, **kwargs): + return kwargs.get("prefix", "") + word + kwargs.get("suffix", "") + + + print(combine_words("child")) # 'child' + print(combine_words("child", prefix="man")) # 'manchild' + print(combine_words("child", suffix="ish")) # 'childish' + print(combine_words("work", suffix="er")) # 'worker' + print(combine_words("work", prefix="home")) # 'homework' + ``` +- note - args and kwargs are just conventions inside python, we can name them differently as well +- the order of parameters should be as follows - + - normal parameters + - `*args` + - default parameters + - `**kwargs` +- **unpacking args** - we can unpack the arguments in a list while passing it to a function as follows - + ```py + def unpack_add(a, b, c): + return a + b + c + + + numbers = [1, 2, 3] + print(unpack_add(*numbers)) + ``` +- now, we can extend this functionality to `*args` as well. when we pass a list without unpacking, args ends up being a tuple, with the first argument as the list itself. however, we get the desired functionality when we unpack the list while passing it to the function + ```py + def adder(*args): + return sum(args) + + + numbers = [1, 2, 3, 4] + print(adder(numbers)) # TypeError: unsupported operand type(s) for +: 'int' and 'list' + print(adder(*numbers)) # 10 + ``` +- similarly, we can unpack dictionaries as well - + ```py + def get_display_name(first_name, last_name): + return f"{first_name} {last_name}" + + + user = {"first_name": "shameek", "last_name": "agarwal"} + + print(get_display_name(**user)) # shameek agarwal + ``` +- notice how though unpacking and args / kwargs can be combined, they are separate things +- combining unpacking and kwargs - + ```py + def get_display_name(**kwargs): + return f"{kwargs.get('first_name')} {kwargs.get('last_name')}" + + + user = {"first_name": "shameek", "last_name": "agarwal"} + + print(get_display_name(**user)) # shameek agarwal + ``` + +### Lambdas & Builtin Functions + +- **lambdas** - functions that are short, one line expressions + ```py + square = lambda num: num ** 2 + add = lambda a, b: a + b + + print(square(3)) # 9 + print(add(4, 9)) # 13 + ``` +- lambdas are useful when we for e.g. want to pass small functions as a callback to other functions +- **map** - accepts a function and an iterable. it then runs the function for each value in the iterable +- my understanding - it returns a map object which while iterable, has limited functionality. that is why we again convert it to a list. this is a common theme in all functions we see now - zip returns zip object, map returns map object and so on. 
we convert these special objects to a list manually + ```py + numbers = [1, 2, 3, 4] + doubled = list(map(lambda x: x * 2, numbers)) + print(doubled) + ``` +- **filter** - filter out elements of the iterable that do not satisfy the condition +- it is possible to do this map and filter using [comprehensions](#comprehensions) as well, which is a bit more readable. it depends on use case +- **all** - return true if all elements of the iterable are truthy. if iterable is empty, return true +- **any** - return true if any element of the iterable is truthy. if iterable is empty, return false + ```py + numbers = [1, 2, 3, 4] + print([num > 0 for num in numbers]) # [True, True, True, True] + + print(all([num > 0 for num in numbers])) # True + print(all([num > 1 for num in numbers])) # False + + print(any([num > 0 for num in numbers])) # True + print(any([num > 4 for num in numbers])) # False + ``` +- **sorted** - accept an iterable and returns a new iterable with the sorted elements. notice the difference between sorted and the **sort** we saw in [lists](#lists) - sorted is not in place, sort is + ```py + numbers = [1, 2, 3, 4] + + print(sorted(numbers)) # [1, 2, 3, 4] + print(f'stays the same: {numbers}') # stays the same: [1, 2, 3, 4] + print(sorted(numbers, reverse=True)) # [4, 3, 2, 1] + ``` +- specify custom sorting logic - + ```py + users = [ + {"username": "samuel", "tweets": ["I love cake", "I love pie", "hello world!"]}, + {"username": "katie", "tweets": ["I love my cat"]}, + {"username": "jeff", "tweets": [], "color": "purple"}, + {"username": "bob123", "tweets": [], "num": 10, "color": "teal"}, + {"username": "doggo_luvr", "tweets": ["dogs are the best", "I'm hungry"]}, + {"username": "guitar_gal", "tweets": []} + ] + + print(sorted(users, key=lambda user: user["username"])) + # [ + # {'username': 'bob123', 'tweets': [], 'num': 10, 'color': 'teal'}, + # {'username': 'doggo_luvr', 'tweets': ['dogs are the best', "I'm hungry"]}, + # {'username': 'guitar_gal', 'tweets': []}, + # {'username': 'jeff', 'tweets': [], 'color': 'purple'}, + # {'username': 'katie', 'tweets': ['I love my cat']}, + # {'username': 'samuel', 'tweets': ['I love cake', 'I love pie', 'hello world!']} + # ] + ``` +- **max** - find the max in iterable etc. i think works for *args as well based on the first example + ```py + print(max(3, 1, 4, 2)) # 4 + print(max([3, 1, 4, 2])) # 4 + ``` +- custom logic for max - + ```py + names = ['arya', 'samson', 'tim', 'dory', 'oleander'] + print(max(names, key=lambda name: len(name))) # oleander + ``` +- **reversed** - again, unlike the **reverse** we saw in [lists](#lists), this does not do it in place + ```py + numbers = [1, 2, 3, 4] + print(list(reversed(numbers))) # [4, 3, 2, 1] + + for i in reversed(range(5)): + print(i) + # 4 3 2 1 0 + ``` +- **len** - length of iterable. e.g. calling it on a dictionary will return the number of keys it has - + ```py + print(len({"name": "shameek", "age": 25, "profession": "IT"})) # 3 + print(len([1, 2, 3, 4, 5])) # 5 + ``` +- **abs**, **round**, **sum** - all self explanatory. notice how we can provide sum with an initial value as well + ```py + print(abs(-4)) # 4 + print(abs(4)) # 4 + + print(sum([1, 2, 3, 4], 5)) # 15 + print(sum((2.0, 4.5))) # 6.5 + + print(round(5.4123, 2)) # 5.41 + print(round(1.2, 3)) # 1.2 + ``` +- **zip** - makes an iterator that aggregates elements from each of the iterators i.e. ith tuple contains the ith element from each of the iterator. 
the iterator stops when the shortest iterator is exhausted + ```py + numbers = [1, 2, 3, 4, 5] + squares = [1, 4, 9] + print(zip(numbers, squares)) # + print(list(zip(numbers, squares))) # [(1, 1), (2, 4), (3, 9)] + ``` +- a slightly complex example of combining zip with [unpacking](#args-kwargs-and-unpacking). we unpacks the list, and it essentially means we are passing several tuples to zip. so, first element of all tuples are combined to form the first element, and second element of all tuples are combined to form the second element + ```py + tuples = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)] + print(list(zip(*tuples))) # [(1, 3, 5, 7, 9), (2, 4, 6, 8, 10)] + ``` +- e.g. we have a list of students, and their attempts in two exams. we want a dictionary keyed by student names, and their final score which is the best of the two attempts - + ```py + # question + attempt_1 = [80, 91, 78] + attempt_2 = [98, 89, 53] + students = ["dan", "ang", "kate"] + + # solution + final_scores = map(max, zip(attempt_1, attempt_2)) + final_scores_by_student = dict(zip(students, final_scores)) + print(final_scores_by_student) # {'dan': 98, 'ang': 91, 'kate': 78} + ``` + +## Error Handling + +- we can raise our own error using `raise` + ```py + raise ValueError("a value error") # ValueError: a value error + ``` +- if we raise an error like this, the code execution stops immediately. we can use try blocks to handle errors and then continue the program execution + ```py + try: + foobar + except: + print("an error occurred") + print("after try block") + + # an error occurred + # after try block + ``` +- the above is an example of a **catch all** block - it will handle all errors the same way, which is not advisable. we can handle specific errors as follows - + ```py + def get(d, key): + try: + return d[key] + except KeyError: + print("key does not exist") + return None + + + user = {"name": "shameek"} + + print(get(user, "name")) # shameek + print(get(user, "phone")) # key does not exist, None + ``` +- if the whole **try** block runs successfully, the **else** part is executed, otherwise the except part is executed if an error matches +- **finally** is always executed no matter what +- we can capture the actual error using **as** + ```py + def divide(a, b): + try: + result = a / b + except TypeError as err: + print("arguments should be ints or floats") + print(err) + except ZeroDivisionError: + print("do not divide by zero") + else: + print(f"{a}/{b} = {result}") + finally: + print("execution complete") + ``` + - divide(5, 2) - + ``` + 5/2 = 2.5 + execution complete + ``` + - divide("a", 2) + ``` + arguments should be ints or floats + unsupported operand type(s) for /: 'str' and 'int' + execution complete + ``` + - divide(5, 0) + ``` + do not divide by zero + execution complete + ``` +- catching multiple errors using a single catch block - + ```py + except (TypeError, ZeroDivisionError) as err: + print(err) + ``` + +## Debugging + +- debugging using ides is straightforward, we just create breakpoints and run the program in debug mode +- but we can also use a tool called **pdb** - **python debugger** +- the code up to before `pdb.set_trace()` + ```py + import pdb + + first = "shameek" + last = "agarwal" + + pdb.set_trace() + + prefix = "mr." + greeting = f"hi {prefix} {first} {last}, how can i help you?" 
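+   # while paused at the (Pdb) prompt, n steps to the next line, c continues,
+   # and typing a variable name like first or prefix prints its current value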
+ print(greeting) + ``` +- common commands - + - **n** - **next** - i think this is like step over in intellij + - **c** - **continue** - i think it continues execution normally, and maybe stops at the next breakpoint? + - **l** - shows us the line in code where the execution is paused +- we can enter expressions like we do in the top bar in intellij to inspect variables, evaluate expressions, etc + +## Modules + +- **reuse code** across different files by importing, improves readability, etc +- **built in modules** - come with python, so we do not need to download them. but, we do need to import them explicitly to be able to use them + ```py + import random + + print(random.choice(["rock", "paper", "scissors"])) + ``` +- we can alias the import to use it under a different name as well + ```py + import random as rand + + print(rand.choice(["rock", "paper", "scissors"])) + ``` +- instead of importing everything, we can also import only parts that we need. note - also refer below for different functionality inside random module - + ```py + from random import choice, randint, shuffle + + print(choice(["rock", "paper", "scissors"])) # paper + print(randint(0, 2)) # 2 + + numbers = [1, 2, 3, 4, 5] + shuffle(numbers) + print(numbers) # [5, 4, 1, 2, 3] + ``` +- note - we can use `*` to import everything, but this is generally not advisable +- we can also alias these specific parts just like we aliased `random` using `rand` +- **custom modules** - we can simply import functions without exporting them - + - bananas.py - + ```py + def get_banana(): + return "yummy banana dipped in chocolate" + ``` + - modules.py - + ```py + import bananas + + print(bananas.get_banana()) + ``` +- **external modules** - we can download them from pypi - **python package index**. they are 3rd party modules which we can use +- command to install - `python -m pip install autopep8` +- **autopep8** - formats python code. to format, use - `autopep8 --in-place --aggressive --aggressive external_modules.py` +- note - above, we configure aggressiveness with level 2. a lower level would for e.g. only make whitespace changes and nothing else + +### name + +- `__name__` - it is set to `__main__` if the current file is run i.e. we use `python3 file.py`, else it is set to the name of the file +- say_sup.py - + ```py + def say_sup(): + print(f"sup! i am in {__name__}") + + + say_sup() + ``` +- say_hi.py - + ```py + from say_sup import say_sup + + + def say_hi(): + print(f"hi! i am in {__name__}") + + + say_hi() + say_sup() + ``` +- output - + ``` + sup! i am in say_sup + hi! i am in __main__ + sup! i am in say_sup + ``` +- output line 1 - the code inside the module(s) being imported are run first. say_sup.py is run, which calls `say_sup` +- output line 2 and 3 - current file is being executed. say_hi.py is run, where we call `say_hi` and `say_sup` +- to prevent line 1, we can change say_sup.py as follows - + ```py + def say_sup(): + print(f"sup! i am in {__name__}") + + + if __name__ == "__main__": + say_sup() + ``` +- now, running say_hi.py gives the following output - + ``` + hi! i am in __main__ + sup! i am in say_sup + ``` +- while running say_sup.py gives continues to give the following output - + ``` + sup! 
i am in __main__ + ``` + +## HTTP Requests + +- **requests** - the library that is commonly used for making requests +- when we call `response.json()`, the response is converted to a python dictionary +- an example combining requests parameters etc below + +```py +import requests +from random import choice + +url = "https://icanhazdadjoke.com/search" + +topic = input("enter topic to get jokes for: ") + +response = requests.get( + url, + headers={"Accept": "application/json"}, + params={"term": topic} +) + +if response.status_code == 200: + + data = response.json() + jokes = list(map(lambda joke: joke["joke"], data['results'])) + + if len(jokes) > 0: + print(f"i got {len(jokes)} joke(s) for the topic you searched for. here is one:") + print(choice(data["results"])["joke"]) + else: + print(f"uh oh, no jokes found for '{topic}', try another topic") + +# enter topic to get jokes for: rat +# i got 2 joke(s) for the topic you searched for. here is one: +# Why couldn't the kid see the pirate movie? Because it was rated arrr! +``` + +## Object Oriented Programming + +- **classes** - attempts to model anything in the real world that is tangible (or non-tangible) via programming +- **classes** are like blueprints for **objects**. **objects** are instances of a **class** +- when we were creating lists or even int, we were basically creating objects of int / list classes +- goal - make a hierarchy of the classes after identifying the different entities +- note - visibility modifiers like private etc are not supported by python - so, we prefix variables and methods not meant to be touched from outside the class with underscores instead +- defining a class. note - `pass` acts like a placeholder, it helps us stay syntactically correct, and the idea is that we revisit it later + ```py + class User: + pass + ``` +- creating objects for this class - + ```py + user1 = User() + print(user1) # <__main__.User object at 0x77e693863040> + ``` +- **self** - refers to the instance. technically, we can name it something else, but self is pretty much the standard everywhere +- self must be the first parameter to all the methods of a class +- **init** - called when we instantiate the class + ```py + class User: + def __init__(self, name): + self.name = name + + + user1 = User("shameek", 25) + print(user1.name) # shameek + ``` +- convention - starting and ending with `__` is used to for e.g. override methods built into python +- so, for custom private methods / variables, we can prefix with a single `_` +- **name mangling** - when we prefix attributes with a `__`, python internally prepends it with the class name. helps distinguish in case they are overridden by child class. this has been discussed later + ```py + class User: + def __init__(self, name, age): + self.name = name + self.age = age + self._secret = "hint (convention): do not access me directly" + self.__profession = "unemployed" + + + user1 = User("shameek", 25) + print(user1._secret) # hint (convention): do not access me directly + print(user1._User__profession) # unemployed + print(user1.__profession) # AttributeError: 'User' object has no attribute '__profession'. Did you mean: '_User__profession'? + ``` +- adding **instance methods** - + ```py + # .... + def greeting(self): + return f"hi {self.name}!" + + print(user1.greeting()) # hi shameek! 
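+   # the call above is the same as User.greeting(user1) - python passes the
+   # instance in as self automatically when we write user1.greeting()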
+ ``` +- till now, we have seen **instance attributes** and **instance methods**, now we discuss **class attributes** and **class methods** +- class attributes / methods exist directly on the class and are shared across instances +- defining class attributes - + ```py + class User: + active_users = 0 + + # ... + ``` +- accessing class attributes from instance methods or outside - + ```py + # ... + def __init__(self, name, age): + self.name = name + self.age = age + User.active_users += 1 + + print(f"active users = {User.active_users}") # active users = 0 + user1 = User("shameek", 25) + user2 = User("colt", 50) + print(f"active users = {User.active_users}") # active users = 2 + ``` +- all objects in python get their unique id which python assigns. we can check that both users point too the same active_users int object as follows - + ```py + print(id(user1.active_users)) # 134256650092816 + print(id(user2.active_users)) # 134256650092816 + ``` +- note - above shows that we can access class attributes via the instance as well. even self inside the class can be used to access the class attributes. accessing via the class however, improves readability +- class methods - decorate with `@classmethod`. the first argument it receives is **cls** and not self. look at the print statements below to understand the difference + ```py + class User: + active_users = 0 + + def __init__(self, name, age): + print(self) + self.name = name + self.age = age + User.active_users += 1 + + @classmethod + def get_active_users(cls): + print(cls) + return cls.active_users + + + user1 = User("shameek", 25) + user2 = User("colt", 50) + print(User.get_active_users()) + + # <__main__.User object at 0x718a83b63eb0> + # <__main__.User object at 0x718a83b63e50> + # + # 2 + ``` +- another example, like a factory method - + ```py + # ... + + @classmethod + def create(cls, csv_row): + name, age = csv_row.split(",") + return cls(name, age) + + user3 = User.create("shameek,25") + print(user3.name) # shameek + print(user3.age) # 25 + ``` +- **repr** is one of the several ways to provide a string representation - + ```py + # ... + def __repr__(self): + return f"{self.name} aged {self.age}" + + print(user3) # shameek aged 25 + string_repr = str(user3) + print(string_repr) # shameek aged 25 + ``` +- **properties** - helps use getter and setter methods underneath, while clients interact with them like normal attributes. 
advantage - when our getter / setter logic has some complexity underneath and simple assignment / accessing is not enough + ```py + class Human: + def __init__(self, first_name, last_name): + self.first_name = first_name + self.last_name = last_name + + @property + def full_name(self): + return f"{self.first_name} {self.last_name}" + + @full_name.setter + def full_name(self, full_name): + self.first_name, self.last_name = full_name.split(" ") + + + shameek = Human("", "") + print(f"{shameek.first_name}, {shameek.last_name}, {shameek.full_name}") # , , + + shameek.full_name = "shameek agarwal" + print(f"{shameek.first_name}, {shameek.last_name}, {shameek.full_name}") # shameek, agarwal, shameek agarwal + ``` +- there is a handy **dict** attribute we can access to look at the instance attributes of the class - + ```py + class Human: + def __init__(self, first_name, last_name): + self.first_name = first_name + self.last_name = last_name + + + human = Human("shameek", "agarwal") + print(human.__dict__) # {'first_name': 'shameek', 'last_name': 'agarwal'} + ``` + +### Inheritance + +- notice how instance variables / methods of superclass are accessible from child class - + ```py + class Animal: + + def __init__(self): + self.is_animal = True + + def make_sound(self, sound): + print(f"i say {sound}") + + + class Cat(Animal): + pass + + + cat = Cat() + print(cat.is_animal) # True + cat.make_sound("meow") # i say meow + ``` +- **is instance** returns true for the parent class as well + ```py + print(isinstance(cat, Cat)) # True + print(isinstance(cat, Animal)) # True + ``` +- calling superclass init from subclass + ```py + class Animal: + + def __init__(self, species, name): + self.species = species + self.name = name + + def __repr__(self): + return f"{self.name} is a {self.species}" + + + class Cat(Animal): + + def __init__(self, name, breed, favourite_toy): + super().__init__("cat", name) + self.breed = breed + self.favourite_toy = favourite_toy + + + blue = Cat("blue", "scottish fold", "string") + print(blue) # blue is a cat + ``` +- multiple inheritance explained with output - + ```py + class Aquatic: + def __init__(self, name): + print("init of aquatic") + self.name = name + + def swim(self): + print(f"{self.name} is swimming") + + def greet(self): + print(f"{self.name}, king of the ocean") + + + class Ambulatory: + def __init__(self, name): + print("init of ambulatory") + self.name = name + + def walk(self): + print(f"{self.name} is walking") + + def greet(self): + print(f"{self.name}, king of the land") + + + class Penguin(Aquatic, Ambulatory): + def __init__(self): + print("init of penguin") + super().__init__("pingu") + + + pingu = Penguin() # init of penguin, init of aquatic + + pingu.swim() # pingu is swimming + pingu.walk() # pingu is walking + + pingu.greet() # pingu, king of the ocean + ``` +- instance methods from both are inherited - we are able to call both walk and swim +- in cases of instance methods like greet defined in both, aquatic is taking preference +- aquatic's init is being called when we use super in subclass +- **mro** or **method resolution order** - the order in which python is going to look for methods +- the underlying algorithm is complex, but we can inspect it using the mro method on classes + ```py + print(Penguin.__mro__) + # (, , , ) + ``` +- so maybe this decides what order to traverse superclasses in when super is used / what superclass will be ultimately used when an instance method is referenced +- as a resolve, e.g. 
if we want to call init for both classes, instead of using super, we can reference the class directly + ```py + # ... + class Penguin(Aquatic, Ambulatory): + + def __init__(self): + print("init of penguin") + # super().__init__("pingu") + Aquatic.__init__(self, "pingu") + Ambulatory.__init__(self, "pingu") + # ... + + pingu = Penguin() # init of penguin, init of aquatic, init of ambulatory + ``` +- now, if we understand mro further - if the init in our superclass is enough - we can skip the init in the subclass altogether, because due to mro, the superclass init will be automatically called by python if its subclass does not have an init + +### Polymorphism + +- **method overriding** - method in superclass rewritten in subclass +- **polymorphism** - same method works in different ways depending on the object + ```py + class Animal: + def speak(self): + raise NotImplementedError("subclasses need to override this method") + + + class Dog(Animal): + def speak(self): + return "woof" + + + class Cat(Animal): + def speak(self): + return "meow" + + + dog = Dog() + print(dog.speak()) # woof + + cat = Cat() + print(cat.speak()) # meow + ``` +- probably an extension of this is seen in **magic methods** - when we use `+`, it behaves differently depending on the [**data type**](#variables-and-data-types) - int vs string. `+` calls `__add__` underneath. `len` works in a similar manner + ```py + class Human: + + def __init__(self, name, height): + self.name = name + self.height = height + + def __len__(self): + return self.height + + def __add__(self, other): + return Human("newborn", self.height + other.height) + + + human = Human("kevin", 65) + print(len(human)) # 65 + + new_human = human + Human("jenny", 60) + print(len(new_human)) # 125 + ``` + +## Iterators and Generators + +- **iterator** - it returns one element at a time when **next** is called on it +- **iterable** - an object that can return an iterator +- at the end, **stop iteration** error is raised +- this is also the working mechanism of for each loops that we use +- a custom for loop implementation - + ```py + def custom_for(iterable, function): + custom_iterator = iter(iterable) + + while True: + try: + element = next(custom_iterator) + except StopIteration: + print("end of iteration...") + break + else: + function(element) + + + custom_for("hey", print) + + # h + # e + # y + # end of iteration... + ``` +- making a custom class as iterable - + ```py + class Counter: + + def __init__(self, low, high): + self.low = low + self.high = high + + def __iter__(self): + return iter(range(self.low, self.high)) + + + counter = Counter(1, 5) + for x in counter: + print(x, end=' ') # 1 2 3 4 + ``` +- going a step further and customizing next. my understanding - calling `iter()` will now give the counter instance itself. now, python will keep calling next on the iterator. basically, our iterator and iterable instances are the same now + ```py + def __iter__(self): + return self + + def __next__(self): + if self.low < self.high: + self.low += 1 + return self.low - 1 + raise StopIteration + ``` +- one issue i felt with the approach above - we can only iterate through it once. reason - maybe because we return the "same iterator instance" every time, once the low becomes equal to high, we cannot start iterating once again. e.g. 
see how the second call to iterate does not print anything - + ```py + def iterate(custom_iterator): + for x in custom_iterator: + print(x, end=' ') + print() + + + counter = Counter(1, 5) + iterate(counter) # 1 2 3 4 + iterate(counter) # + ``` +- so, i instead tried returning a copy of the current instance from `__iter__`, and it worked - + ```py + from copy import copy + + # ... + + def __iter__(self): + return copy(self) + + # ... + + iterate(counter) # 1 2 3 4 + iterate(counter) # 1 2 3 4 + ``` +- **generators** - a subset of **iterators** +- **generator functions** - use **yield** instead of return keyword, and can yield **multiple** times +- once python sees the yield keyword anywhere in the function, it knows that it needs to return a **generator** +- summary - generator functions return a generator, and generators are a type of iterator +- yield is like a pause - the function stops executing, and resumes from after the yield statement when **next** is called on the generator again +- i am guessing that the usual `StopIteration` is raised when the end of function is reached and no more yields are found + ```py + def counter(up_to): + count = 1 + + while count <= up_to: + yield count + count += 1 + + + three_counter = counter(3) + + print(three_counter) # + + print(next(three_counter)) # 1 + print(next(three_counter)) # 2 + print(next(three_counter)) # 3 + print(next(three_counter)) # StopIteration + ``` +- we can also use our usual for loop now, since a generator is an iterator - + ```py + three_counter = counter(3) + + for i in three_counter: + print(i, end=' ') # 1 2 3 + ``` +- an example of a use case for a generator - assume as a client, i were to iterate over all fibonacci numbers from 1 to n by making a call to a library, so the library has to return an iterator + - option 1 - return a populated list + - option 2 - use a generator +- to toggle between the options below, comment / uncomment lines related to result / yield. we measure the memory usage between the two. reason for the difference - generators only need to store the state of the current execution's variables, while the list needs to be pre populated entirely upfront + ```py + import resource + + + def fibonacci(n): + x, y, count = 0, 1, 1 + # result = [] + + while count <= n: + yield x + # result.append(x) + x, y = y, x + y + count += 1 + + # return result + + + def show_usage(label): + usage = resource.getrusage(resource.RUSAGE_SELF) + print(f"{label}: mem={usage[2] / 1024.0}mb") + + + show_usage("before") + for i in fibonacci(100000): + pass + show_usage("after") + ``` + ![generators iterators](/assets/img/python-basics/generators-iterators.png) +- **generator expressions** - it is a shorter way of using generators compared to generator functions + - using generator functions - + ```py + def get_multiples(base=1, number_of_multiples=10): + result = base + + while number_of_multiples > 0: + yield result + result += base + number_of_multiples -= 1 + + + for i in get_multiples(2, 3): + print(i) + ``` + - using generator expressions - + ```py + def get_multiples(base=1, number_of_multiples=10): + return (2 * multiple for multiple in range(1, 4)) + + + for i in get_multiples(2, 3): + print(i) + ``` +- in general - prefer generators if possible instead of lists etc - using generators would be more efficient in terms of memory, performance, etc, if we only want to iterate through it once. 
use lists if we want to perform complex operations like append etc further down the line - + ```py + print(sum(i for i in range(100000000))) # faster / more optimal than + print(sum([i for i in range(100000000)])) + ``` + +## File IO + +- **reading files** - we read files using **open** function, which returns a **file** object. then, the file object can be used to access metadata / access its content using **read** + ```py + file = open("story.txt") + content = file.read() + + print(content) # prints the contents of the file + ``` +- if we call read twice, the second read returns an empty string, because the **cursor** has already reached the end of the file after the first read + ```py + file = open("story.txt") + print("read1: ", file.read()) # read1: contents of the file + print("read2: ", file.read()) # read2: <> + ``` +- this also means that for e.g. if we add two lines to story.txt between the first and the second read, the second read will only display these two new lines +- we can use **seek** to set the position of the cursor + ```py + file = open("story.txt") + print("read1: ", file.read()) # read1: contents of the file + file.seek(0) + print("read2: ", file.read()) # read2: contents of the file + ``` +- **read line** - read line by line - it reads till a new line character is encountered - + ```py + file = open("story.txt") + print(f"line 1: {file.readline()}") + print(f"line 2: {file.readline()}") + print(f"line 3: {file.readline()}") + ``` +- **read lines** - returns us a list, where each element represents a line in the file +- we need to close files manually to avoid using system resources - `file.close()` +- **with** blocks - we do not have to handle closing of resource etc when using with blocks - + ```py + with open("story.txt") as file: + lines = file.readlines() + print(f"total lines in file = {len(lines)}") + ``` +- **writing to files** -it also creates the file anew if it does not already exist + ```py + with open("story.txt", "w") as file: + file.write("this was added via python\n") + file.write("this overwrites the file completely") + ``` +- write also overwrites the file completely, and does not append to the end. to append to the end, use "a" for the mode flag +- we can use "r+" for mode to both read and write simultaneously + +## Pickling + +- imagine we have a class as follows - + ```py + class Human: + + def __init__(self, name, age): + self.name = name + self.age = age + + def celebrate_birthday(self): + print(f"happy birthday {self.name}") + self.age += 1 + ``` +- **pickling** serializing and storing the state of python objects. use case - saving the state across application restarts etc + ```py + shameek = Human("shameek", 24) + + with open("human.pickle", "wb") as file: + pickle.dump(shameek, file) + ``` +- **unpickling** is the reverse, deserialization process. understand how python is constructing an object of the right class for us, so we are able to interact with instance methods etc as well + ```py + with open("human.pickle", "rb") as file: + shameek = pickle.load(file) + shameek.celebrate_birthday() # happy birthday shameek + print(f"{shameek.name} aged {shameek.age}") # shameek aged 25 + ``` +- simply change method calls to `pickle.dump(iterable, file)` etc when interacting with a collection +- [**jsonpickle**](https://pypi.org/project/jsonpickle/) - the pickle library works with binary - e.g. the modes we used were **rb** / **wb**, etc. if we want the file to be json instead, we use this library. 
advantage - readable, useful for serving via rest api etc. disadvantage - less efficient compared to binary + +## CSV + +- we use [file io](#file-io) in combination with the **csv module** to interact with csvs +- if we use he **reader**, each row is represented as a list of strings. first row is included as well. the trick used here is to manually call next once on the iterator + ```py + with open("fighters.csv") as file: + fighters_csv = reader(file) + header = next(fighters_csv) + + for row in fighters_csv: + print(row) + + # ['Ryu', 'Japan', '175'] + # ['Ken', 'USA', '175'] + # ['Chun-Li', 'China', '165'] + # ['Guile', 'USA', '182'] + ``` +- if we use **dict reader**, each row is represented as a dictionary. keys are constructed using the first row + ```py + with open("fighters.csv") as file: + fighters_csv = DictReader(file) + + for row in fighters_csv: + print(row) + + # {'Name': 'Ryu', 'Country': 'Japan', 'Height (in cm)': '175'} + # {'Name': 'Ken', 'Country': 'USA', 'Height (in cm)': '175'} + # {'Name': 'Chun-Li', 'Country': 'China', 'Height (in cm)': '165'} + # {'Name': 'Guile', 'Country': 'USA', 'Height (in cm)': '182'} + ``` +- writing to csv files - since i wanted to just add a row and not overwrite the row entirely, i opened it in **append mode**. opening using **write mode** would have overwritten the file entirely with just the one row that i specified - + ```py + with open("fighters.csv", "a") as file: + fighters_csv = writer(file) + fighters_csv.writerow(["Shameek", "India", "165"]) + ``` +- writing using **dict writer** - + ```py + with open("people.csv", "w") as file: + + fieldnames = ["name", "age"] + + fighters_csv = DictWriter(file, fieldnames=fieldnames) + fighters_csv.writeheader() + + fighters_csv.writerow({"name": "shameek", "age": 25}) + fighters_csv.writerow({"name": "colt", "age": 50}) + ``` +- assume csv has two columns - first and last name. return the row number of the row that matches the given values + ```py + import csv + + def find_user(first_name, last_name): + with open("users.csv") as file: + csv_reader = csv.reader(file) + header = next(csv_reader) + for index, row in enumerate(csv_reader): + if row[0] == first_name and row[1] == last_name: + return index + 1 + return 'Not Here not found.' + ``` +- note, my understanding - we should iterate one by one for efficiency since it is an iterator, instead of converting the iterator to a list. also, look at how we use the enumerate function + +## Decorators + +- **decorators** are **higher order functions** i.e. functions that wrap other functions to enhance their behavior +- doing this manually - e.g. we create a polite version of our introduction function + ```py + def be_polite(fn): + def wrapped(): + print("what a pleasure to meet you") + fn() + print("have a great day") + + return wrapped + + + def introduction(): + print("my name is shameek") + + + polite_introduction = be_polite(introduction) + polite_introduction() + + # what a pleasure to meet you + # my name is shameek + # have a great day + ``` +- using decorators - i just need to annotate `introduction` with `be_polite`, and python takes care of the rest. notice how we simply call introduction now for the same functionality + ```py + @be_polite + def introduction(): + print("my name is shameek") + + introduction() + + # what a pleasure to meet you + # my name is shameek + # have a great day + ``` +- right now, `introduction` does not accept any arguments, therefore `wrapped` could also stay empty. 
what if we had multiple functions with different **method signatures**? how can we make the `wrapped` returned from `be_polite` flexible? using [**args and kwargs**](#args-kwargs-and-unpacking) + ```py + def be_polite(fn): + def wrapped(*args, **kwargs): + print("what a pleasure to meet you") + result = fn(*args, **kwargs) + print("have a great day") + return result + + return wrapped + + + @be_polite + def introduction(): + print("my name is shameek") + + + @be_polite + def greet(name, age): + print(f"i am {name} aged {age}, you?") + + + introduction() # what a pleasure to meet you | my name is shameek | have a great day + greet("shameek", 25) # what a pleasure to meet you | i am shameek aged 25, you? | have a great day + ``` +- one problem - when we print the name of the decorated function, try accessing the docstring of the decorated function, etc - we see details for `wrapped`, and not `introduction` - + ```py + print(introduction.__name__) # wrapped + ``` +- solution - we use **wraps** decorator on wrapped + ```py + from functools import wraps + + + def be_polite(fn): + @wraps(fn) + def wrapped(*args, **kwargs): + # ... + + print(introduction.__name__) # introduction + ``` + +- a practical example - benchmarking to see difference between lists and generators - + ```py + import time + from functools import wraps + + + def speed_test(fn): + @wraps(fn) + def wrapped(*args, **kwargs): + start_time = time.time() + print(f"executing {fn.__name__}") + result = fn(*args, **kwargs) + time_taken = time.time() - start_time + print(f"time taken: {time_taken}") + return result + + return wrapped + + + @speed_test + def using_lists(end): + return sum([i * i for i in range(end)]) + + + @speed_test + def using_generators(end): + return sum(i * i for i in range(end)) + + + print(using_lists(100_000_000)) + print(using_generators(100_000_000)) + ``` + ![python basics](/assets/img/python-basics/decorators.png) + +### Decorators with Arguments + +- my understanding - till now, we were using this - `@decorator` +- but now we want to do this - `@decorator(arg1, arg2, ...)` +- so, it is almost like now, we are making a function call. so, another layer of function needs to be returned +- below is a complex example i took a stab at, so not sure about the correctness 😛 +- we want to ensure the types of arguments passed to a function using a decorator - + ```py + from functools import wraps + + + def enforce(*types): + def outer_wrapper(fn): + @wraps(fn) + def inner_wrapper(*args, **kwargs): + arg_types = types[:len(args)] + kwarg_types = types[len(args):] + + arg_types_match = all(isinstance(arg, type_of_arg) for type_of_arg, arg in zip(arg_types, args)) + kwarg_types_match = all(isinstance(arg, type_of_kwarg) for type_of_kwarg, arg in zip(kwarg_types, kwargs.values())) + + if (not arg_types_match) or (not kwarg_types_match): + raise ValueError("argument types do not match") + + return fn(*args, **kwargs) + + return inner_wrapper + + return outer_wrapper + ``` +- e.g. of using this decorator - the second one fails because the type expected for the second argument is integer, while we send in a string - + ```py + @enforce(str, int) + def printer(name, age): + print(f"i am {name} aged {age}") + + printer("shameek", age=25) # i am shameek aged 25 + printer("shameek", age="25") # ValueError: argument types do not match + ``` + +## Testing + +- helps reduce bugs - e.g. when changes are made to existing code that results in unintended effects. 
our tests can help catch these bugs early +- **tdd** or **test driven development** - write tests first, and write code to have these tests pass +- we can use **assert** to make assertions - it returns None if the expression is truthy, raises an AssertionError otherwise. we can also specify the error message to use inside the assertion error + ```py + assert 1 == 1 + assert 1 == 2 + assert 1 == 2, "validation failed" + ``` +- problem with assert - if we run it in optimized mode (`python3 -O test_example.py`), all the assert statements are ignored, and the code continues to execute normally + ```py + def say_hi(name): + assert name == "Colt", "I only say hi to Colt!" + return f"Hi, {name}!" + + print(say_hi("Charlie")) # Hi, Charlie! + ``` +- **doctests** - also improves readability of modules exposed to clients - + ```py + def add(a, b): + """ + >>> add(2,3) + 6 + + >>> add(2,"shameek") + Traceback (most recent call last): + ... + TypeError: unsupported operand type(s) for +: 'int' and 'str' + """ + return a + b + ``` +- to run doctests, use the following command - `python3 -m doctest -v test_example.py`. it will show that first test will fail, since expected is 6 but actual is 5 +- disadvantage - very finicky - even a simple whitespace can fail a perfect valid test +- **unit testing** - test small standalone components of classes, instead of testing interaction between different components / entire applications in one go +- assume we have the below file - + ```py + def eat(food, is_healthy): + reason = "it is good for me" if is_healthy else "you only live once" + return f"i am eating {food} because {reason}" + ``` +- we create a new test file, where we import the different functionalities and test it as follows + ```py + from test_example import eat + import unittest + + + class ActivitiesTest(unittest.TestCase): + + def test_eat_healthy(self): + self.assertEqual(eat("broccoli", True), "i am eating broccoli because it is good for me") + + def test_eat_unhealthy(self): + self.assertEqual(eat("pizza", False), "i am eating pizza because you only live once") + + + if __name__ == "__main__": + unittest.main() + ``` +- note - i think unittest looks for methods with prefix test +- we run the file containing tests like we would normally run a python file. if we add the verbose flag, the name of the tests being executed are also displayed + ```sh + python3 test_example_tests.py -v + ``` +- we also have other variations of assert like **true** / **false**, **in** / **not in**, **raises** (for asserting on type of error thrown) etc. e.g. below, we deal all the cards first, and then expect a value error to be thrown if we try dealing a card + ```py + # ... + def test__given_full_deck__when_5_cards_are_dealt__then_5_cards_are_returned(self): + self.deck.deal_hand(self.deck.count()) + + with self.assertRaisesRegex(ValueError, 'All cards have been dealt'): + self.deck.deal_card() + ``` +- **hooks** - run code before or after tests - creating database connections, adding fake data, etc. we need to override methods for this - + ```py + # ... + def setUp(self): + self.deck = Deck() + + def test__given_deck__when_count__then_52_is_returned(self): + self.assertEqual(self.deck.count(), 52) + + def tearDown(self): + pass + ``` + +## Web Scraping + +- programmatically download web pages, extract it and then use that data +- used when data from servers is not in the form of json +- as a best practice, we should refer the robots.txt of websites to see what paths they want to allow vs disallow scraping. e.g. 
refer [this](https://www.imdb.com/robots.txt) before scraping imdb. however, this is just a best practice, and nothing is stopping us from scraping publicly available websites
+ - the library used is **beautiful soup** - `python -m pip install bs4`
+ - we read from an html file and interact with the beautiful soup object (a page fetched over http works the same way - see the sketch after the css selector examples below)
+   ```py
+   from bs4 import BeautifulSoup
+
+   with open("mocked.html") as html_file:
+       html_content = html_file.read()
+
+   soup = BeautifulSoup(html_content, "html.parser")
+
+   print(soup.find("div"))        # the first div tag, printed as html (its text here is "bye")
+   print(type(soup.find("div")))  # <class 'bs4.element.Tag'>
+   ```
+ - notice that while it prints the exact div when we use the print statement, it is not stored as a string, but a beautiful soup tag underneath
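+ - as a small aside - tags can also be reached with plain attribute access, which is what lets us write things like `soup.body` further below. a sketch, assuming the page has a body with a div somewhere inside it -
+   ```py
+   print(type(soup.body))  # <class 'bs4.element.Tag'>
+   print(soup.body.div)    # the first div inside body - shorthand for soup.body.find("div")
+   ```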
+ - i think using `find` returns the first match, while using `find_all` returns all matches. here, we see matching using id, class and a custom attribute
+   ```py
+   print(soup.find_all(class_="special"))               # [<li class="special">This list item is special.</li>, ...]
+   print(soup.find_all(id="first"))                      # [the tag whose id is "first"]
+   print(soup.find_all(attrs={"data-example": "yes"}))   # [the tag whose data-example attribute is "yes" (text "hi")]
+   ```
+ - we can use css selectors as well. my understanding - `select` works like `find_all`, `select_one` works like `find`
+   ```py
+   print(soup.select(".special"))              # same result as find_all(class_="special")
+   print(soup.select("#first"))                # same result as find_all(id="first")
+   print(soup.select("[data-example='yes']"))  # same result as find_all(attrs={"data-example": "yes"})
+   ```
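+ - the same parsing works on html downloaded over http - a rough sketch below, reusing the [requests](#http-requests) library from earlier. the url is just a placeholder, pick a page whose robots.txt allows scraping
+   ```py
+   import requests
+   from bs4 import BeautifulSoup
+
+   # placeholder url - any page we are allowed to scrape works here
+   response = requests.get("https://example.com/")
+   live_soup = BeautifulSoup(response.text, "html.parser")
+
+   # the same find / find_all api we used on the local file works on the downloaded page
+   print(live_soup.find("title"))
+   print(len(live_soup.find_all("a")))
+   ```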
+ - understanding selectors more - to check if an attribute is "present", use one of the below -
+   ```py
+   print(soup.find_all(attrs={"data-example": True}))  # [the tag(s) that have a data-example attribute (text "hi")]
+   print(soup.select("[data-example]"))                 # same result, using a css selector
+   ```
+ - getting the inner text of an element -
+   ```py
+   print(soup.select_one("#first").get_text())
+   ```
+ - accessing attributes like class, id, etc - `attrs`, which is a dict, has access to all of them
+   ```py
+   print(soup.select_one("#first").attrs["id"])    # first
+   print(soup.select_one("[data-example]").attrs)  # {'data-example': 'yes'}
+   ```
+ - **contents** - shows the contents of a tag. if we see carefully, it also considers new line as children
+   ```py
+   print(soup.body.contents)
+   # something like ['\n', <child tag>, '\n', <child tag>, '\n']
+   ```
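+ - a small combined sketch (same mocked.html assumed) - `find_all` plus `get_text` with a list comprehension gives us just the text of every matching element
+   ```py
+   # keep only the text of each matched tag
+   special_texts = [tag.get_text() for tag in soup.find_all(class_="special")]
+   print(special_texts)  # e.g. ['This list item is special.', ...]
+   ```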
+ - we might need to navigate to siblings. remember the new lines we saw in the previous point, it is reflected in the example below
+   ```py
+   print(soup.select_one("#first").next_sibling)               # <> (it is the "\n" text node)
+   print(soup.select_one("#first").next_sibling.next_sibling)  # the next actual tag
+   ```
+ - this is why, the **find** variants might be better, since they ignore the new line characters. notice how we did not have to chain the find next sibling call twice this time around
+   ```py
+   print(soup.select_one("#first").find_next_sibling())  # the next tag, without any chaining
+   ```
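+ - as an aside (a sketch on the same file), the plural variant returns all the following tag siblings in one go -
+   ```py
+   # every tag sibling after #first - the "\n" text nodes are skipped automatically
+   print(soup.select_one("#first").find_next_siblings())
+   ```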
+ - find next sibling using a specific selector -
+   ```py
+   print(soup.select_one("#first").find_next_sibling(attrs={"data-example": True}))  # the next sibling that has data-example set (its text is "bye")
      + ``` +- till now, we were navigating to next sibling(s). similarly, we can do for previous sibling(s), parent, etc + +## Virtual Environment + +- helps maintain separate and isolated environments for python projects +- manage dependencies and different versions across projects without conflicts +- had to first install this for venv functionality - `sudo apt install python3.10-venv` +- command to create a virtual environment - `python3 -m venv env` +- we can name it anything, env is the convention +- activate the virtual environment - `source ./env/bin/activate`. the shell prompt is now prefixed with `(env)` +- to deactivate the virtual environment, simply run `deactivate` +- listing the installed packages - `pip ls` +- installing packages - `pip install requests` +- now, the idea is we typically save all the dependencies in a file like requirements.txt +- we commit this file to version control, for others to be able to use the exact same version +- generating requirements.txt - `pip freeze > requirements.txt` +- my understanding - the workflow for someone else starting afresh will look like this - + - clone the git repository + - create the virtual environment in the same folder + - activate the virtual environment + - run `pip install -r requirements.txt` diff --git a/_posts/2024-07-23-python-data-analysis.md b/_posts/2024-07-23-python-data-analysis.md new file mode 100644 index 0000000..25b26eb --- /dev/null +++ b/_posts/2024-07-23-python-data-analysis.md @@ -0,0 +1,2872 @@ +--- +title: Python Data Analysis +--- + +## Jupyter Notebooks + +- **jupyter** - web based environment for notebook documents +- allows to have python code along with headings, charts, tables, etc +- the entire flow - + - there is a notebook server running on our terminal bts + - what we do in the browser is communicated to this server to be executed + - finally, the output is returned to be displayed in the browser +- **anaconda** - automates setup of a lot of data science related libraries etc on our computer +- the steps i used to install anaconda on my linux pc is [here](https://docs.anaconda.com/anaconda/install/linux/) +- i also had to run `conda config --set auto_activate_base False` to prevent the environment from being activated on terminal startup +- activate the environment in shell using `source ~/anaconda3/bin/activate` +- to [install packages](https://docs.anaconda.com/working-with-conda/packages/install-packages/), use `conda install <>` +- to run the jupyter notebook, type `jupyter-notebook` in shell +- i am doing all of this from the folder where i want to store my notebooks +- when i created a new **notebook**, i could see that importing pandas and matplotlib was working by default without me installing anything +- execute a cell by using the run button / using shift + enter. 
it executes the current cell and + - inserts a new cell below if it is the last cell + - moves to the next cell below if it is not the last cell +- to only execute the current cell, use cmd + enter +- jupyter notebooks have autosave +- to shut down, use - file -> shutdown or cmd + c on the terminal where we ran jupyter-notebook +- all the code in a cell is executed, but the output is only shown for the last line in the cell +- there are two kinds of mode - **command mode** and **editing mode** + - go into command mode by clicking anywhere outside / hitting escape + - go into editing mode by clicking inside a cell / hitting enter when focus is on a cell +- actions like inserting cells etc can be done using shortcuts when in command mode +- we can navigate through cells using up and down arrow keys when in command mode +- use "a" insert a cell above, "b" to insert a cell below +- use "dd" to delete a cell +- undo cell operation e.g. undo deleting a cell using "z", redo a cell operation using shift + z +- a notebook can have different kinds of **cells** - e.g. code cells, markdown cells, etc. there is a dropdown that lets us select the type of the current cell +- we have a little restart icon which we can use to restart the kernel / stop button to interrupt the kernel. use case - e.g. if one cell ends up executing an infinite loop, we will not be able to execute any other cell +- we can run all the cells from top to bottom using run -> run all cells +- to see the documentation of something - + - `pd.read_csv` - ensure cursor is somewhere on read_csv and do shift + tab - this opens the documentation in a popup + - `pd.read_csv?` - this can be the last statement in a cell, and then the documentation is displayed in the output. the output cell might be bigger and easier to browse through than a popup + +## Dataframes and Datasets + +- textual data can be stored in csv, json, sql, etc +- **dataframes** - 2 dimensional data - rows and columns +- in jupyter, we can take a brief look at the data as seen by pandas using the following - + ```py + house_data = pd.read_csv("data/kc_house_data.csv") + house_data + ``` +- by default, pandas shows us the first 5 and last 5 rows, and first 10 and last 10 columns in jupyter. we can configure this however +- to view all the columns - `house_data.columns` +- to view the number of rows - `len(house_data)` +- to see the number of rows and columns in one go - + ```py + house_data.shape + # (21613, 21) + ``` +- we can construct a new dataframe using the for e.g. first x / last x rows of the existing dataframe + ```py + first_7 = house_data.head(7) + last_6 = house_data.tail(6) + ``` +- by default, pandas will assign data types using the following logic - if it is numeric, assign int if data has no decimals, else float. everything else is assigned to the object data type +- we can view these datatypes assigned by pandas using `info`. note it also shows the number of rows having null value for that column, the index information, the memory used, etc + ```py + house_data.info() + + # + # RangeIndex: 21613 entries, 0 to 21612 + # Data columns (total 21 columns): + # # Column Non-Null Count Dtype + # --- ------ -------------- ----- + # 0 id 21613 non-null int64 + # 1 date 21613 non-null object + # 2 price 21613 non-null float64 + # 3 bedrooms 21613 non-null int64 + # ... 
+ # memory usage: 3.5+ MB + ``` +- specifying a separator manually if the separator used in csv is not comma - + ```py + netflix = pd.read_csv("data/netflix_titles.csv", sep="|") + netflix + ``` +- if we want to provide custom column names, we use `names`. in this case, the first row already had the headers but not in a format we liked, so we also pass 0 for the `header` attribute, so that pandas can skip the first row, and use the column names we provide + ```py + headers = ('sumlev', 'region', 'division', 'state', 'name') + + nst = pd.read_csv("data/nst-est2020.csv", names=headers, header=0) + nst + ``` +- finally, if we would like to use one of the existing columns as the index column, we can specify that as well using the index parameter + ```py + mount_everest_deaths = pd.read_csv("data/mount_everest_deaths.csv", index_col="No.") + mount_everest_deaths + ``` +- note - we can also pass in the column position instead of the column name for index col, like this - `index_col=0` + +## Basic Operations + +- finding the minimum / maximum - it returns a **series** data structure, and we get the min / max for every column when we perform this operation + ```py + house_data.min() + + # id 1000102 + # date 20140502T000000 + # price 75000.0 + # bedrooms 0 + # bathrooms 0.0 + ``` +- **sum** - sum all values. e.g. if a column has only holds a 1 or a 0, it gives us the number of values with a 1 for that attribute. for string like columns, it might concatenate them like strings. to prevent that, we also pass in the **numeric only** attribute + ```py + house_data.sum(numeric_only=True) + + # price 1.167293e+10 + # bedrooms 7.285400e+04 + # bathrooms 4.570625e+04 + # sqft_living 4.495287e+07 + # sqft_lot 3.265069e+08 + # floors 3.229650e+04 + ``` +- similarly, we also have **count** (gives the non null values for all columns), **mean**, **median** and **mode** +- **describe** - automatically gives a bunch of statistics around all numeric columns - + ```py + titanic.describe() + + # pclass survived sibsp parch + # count 1309.000000 1309.000000 1309.000000 1309.000000 + # mean 2.294882 0.381971 0.498854 0.385027 + # std 0.837836 0.486055 1.041658 0.865560 + # min 1.000000 0.000000 0.000000 0.000000 + # 25% 2.000000 0.000000 0.000000 0.000000 + # 50% 3.000000 0.000000 0.000000 0.000000 + # 75% 3.000000 1.000000 1.000000 0.000000 + # max 3.000000 1.000000 8.000000 9.000000 + ``` +- to get stats around non-numeric columns, we can set **include** to **object**. **top** gives the value that occurs the most number of times, while **freq** gives the number of times + ```py + titanic.describe(include = 'object') + + # name sex age ticket fare cabin embarked boat body home.dest + # count 1309 1309 1309 1309 1309 1309 1309 1309 1309 1309 + # unique 1307 2 99 929 282 187 4 28 122 370 + # top Connolly, Miss. Kate male ? CA. 2343 8.05 ? S ? ? ? + # freq 2 843 263 11 60 1014 914 823 1188 564 + ``` + +## Series and Columns + +- selecting a single column - + ```py + titanic["name"] + + # 0 Allen, Miss. Elisabeth Walton + # 1 Allison, Master. Hudson Trevor + # ... + # 1307 Zakarian, Mr. Ortin + # 1308 Zimmerman, Mr. Leo + ``` +- note - this is of type **pandas series** + ```py + type(titanic["name"]) + + # pandas.core.series.Series + ``` +- **series** - one dimensional array with labels +- for instance, i think when we select a column, the labels are the index column of the dataframe. e.g. if i index the dataframe using show id - + ```py + netflix_titles["type"] + + # show_id + # s1 Movie + # s2 TV Show + # ... 
+ # s8806 Movie + # s8807 Movie + ``` +- when we call functions like **sum** for instance (refer [above](#basic-operations)) + - the value in the series is the sum for every column + - the labels are the column names +- functions like **describe** which returned a dataframe when called on a dataframe, will return a series when called on a series +- for functions like `sum` - + - when calling sum on a dataframe, we got a series + - when calling sum on a series, we will get a single value + + ```py + houses["price"].sum() # 11672925008.0 + ``` +- we can access the labels of a series using **index**, and the underlying values using **values** - + ```py + houses_min = houses.min() + houses_min + # id 1000102 + # date 20140502T000000 + # price 75000.0 + + houses_min.index + # Index(['id', 'date', 'price'], dtype='object') + + houses_min.values + # array([1000102, '20140502T000000', 75000.0], dtype=object) + ``` + +## Intermediate Operations + +- **unique** - give the unique value in a series. the return type of such methods is numpy array + ```py + houses["bedrooms"].unique() # array([ 3, 2, 4, 5, 1, 6, 7, 0, 8, 9, 11, 10, 33]) + type(houses["bedrooms"].unique()) # numpy.ndarray + ``` +- **nunique** - number of unique values. by default, **dropna** is True + ```py + netflix_titles["director"].nunique(), netflix_titles["director"].nunique(dropna=False) + # (4528, 4529) + ``` +- **nlargest** - n largest values. by default, n is 5 + ```py + houses['price'].nlargest(n=7) + + # 7252 7700000.0 + # 3914 7062500.0 + # 9254 6885000.0 + # 4411 5570000.0 + # 1448 5350000.0 + # 1315 5300000.0 + # 1164 5110800.0 + ``` +- caveat - handling duplicates - e.g. imagine class has 3 unique values - 1st class, 2nd class and 3rd class. when we call nlargest with n set to 709, we get 709 values, each with value 3. when we call it with 710, we get 709 values for 3, and 1 value for 2. but what if we wanted all values for the last value that comes when using nlargest? we can set the **keep** parameter. when keep is **all**, we get 986 total values, even though n was 710. other possible values for keep are **first** (default) and **last** (probably the last row with the value as 2nd class would be returned in this case?) + ```py + len(titanic['pclass'].nlargest(709)), len(titanic['pclass'].nlargest(709, keep='all')) # (709, 709) + len(titanic['pclass'].nlargest(710)), len(titanic['pclass'].nlargest(710, keep='all')) # (710, 986) + ``` +- similarly, we can call it on dataframes as well - we will need to specify the column names as well this time around though - + ```py + houses.nlargest(5, "price") + + # id date price bedrooms + # 7252 6762700020 20141013T000000 7700000.0 6 + # 3914 9808700762 20140611T000000 7062500.0 5 + # 9254 9208900037 20140919T000000 6885000.0 6 + ``` +- we access a single column like this - `netflix_titles["title"]`. to access multiple columns, we can use the following syntax. note that even though we just pass one parameter, what we get back is still a dataframe, and not a series like we would get when using `netflix_titles["title"]`. note - remember that this creates a new dataframe + ```py + netflix_titles[["title"]] + + # title + # show_id + # s1 Dick Johnson Is Dead + # s2 Blood & Water + # s3 Ganglands + + houses[["bedrooms", "bathrooms"]] + + # bedrooms bathrooms + # 0 3 1.00 + # 1 3 2.25 + # 2 2 1.00 + ``` +- **value counts** - counts of unique values. sorts in descending order of counts by default. 
we can use the **ascending** parameter to sort it in ascending order of counts + ```py + houses["bedrooms"].value_counts() + + # 3 9824 + # 4 6882 + # 2 2760 + ``` +- we can also have **value counts** for a **dataframe**. if we do it for all columns, we might end up having 1 value per row, as any two rows having same values for all columns is rare. we would typically perform this on a subset of columns like below. note - we still get back a series - it feels like the **label** of the series is comprised of multiple attributes, but it is still a pandas series and not a dataframe + ```py + houses[["bedrooms", "bathrooms"]].value_counts() + + # bedrooms bathrooms + # 4 2.50 2502 + # 3 2.50 2357 + # 2 1.00 1558 + ``` +- small note - there are multiple ways of doing a thing, maybe we should try being efficient. i make these mistakes frequently - using the first value in the output of value counts instead of using mode directly, etc - + - `sort_values("Attack").head(20)` vs `nsmallest(20, "Attack")` + - `.value_counts().head(1).index[0]` vs `mode` + +## Plotting Basics + +- in case of a **series**, we plot **values** against **labels**. if i try to for e.g. do `houses["bedrooms"].plot()`, it would not make much sense, since we would be plotting number of bedrooms against an index that might be like a house identifier +- so, we can for e.g. plot value counts of bedrooms - this way, we would be plotting number of houses with the given number of bedrooms against number of bedrooms - as we see below, 3 bedrooms are the most common + ```py + houses['bedrooms'].value_counts().plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/plotting-basics-bedroom-value-counts.png) +- above, we tried plotting a pandas **series**. we can also plot **dataframes** +- e.g. try looking at the general distribution between bedrooms and bathrooms, by plotting one against another. we will have to customize both the x and y axis in this case, otherwise again, we might end up plotting all attributes against the autogenerated index + ```py + houses.plot(x="bedrooms", y="bathrooms", kind="scatter") + ``` + ![](/assets/img/python-data-analysis/houses-bedrooms-vs-bathrooms-scatter-plot.png) + +## Index + +- both **dataframes** and **series** in pandas have **labels** / **indices** +- by default, a **range index** is used - auto incrementing index that goes 0, 1, 2, and so on +- when we select a column in a dataframe, the labels used for the series is the same as the one used for the original dataframe +- e.g. if we have a csv containing the stock related data for a particular stock, we can set the index column to be date, to easily get the low and high price for a particular date. we can set the index to a column manually by calling **set index**. note - like most things, this too returns a new dataframe instead of mutating the original dataframe + ```py + bitcoin["High"] + # 0 147.488007 + # 1 146.929993 + # 2 139.889999 + + + bitcoin = bitcoin.set_index("Date") + # High Low Open Close + # Date + # 2013-04-29 23:59:59 147.488007 134.000000 134.444000 144.539993 + # 2013-04-30 23:59:59 146.929993 134.050003 144.000000 139.000000 + # 2013-05-01 23:59:59 139.889999 107.720001 139.000000 116.989998 + + bitcoin["High"] + # Date + # 2013-04-29 23:59:59 147.488007 + # 2013-04-30 23:59:59 146.929993 + # 2013-05-01 23:59:59 139.889999 + ``` +- if for e.g. 
we were to call `bitcoin["High"].plot()` after setting the index, the plot would make a lot more sense - high price against date, so how the price of bitcoin changed over days / years. without the re-indexing, it would display the price of bitcoin against an auto-incrementing integer, which would not have made much sense +- we can also do it when reading the csv using the **index col** parameter as seen [earlier](#dataframes-and-datasets) + ```py + happiness_indexed = pd.read_csv("data/world-happiness-report-2021.csv", index_col="Country name") + happiness_indexed + + # Healthy life expectancy Freedom to make life choices + # Country name + # Finland 72.000 0.949 + # Denmark 72.700 0.946 + # Switzerland 74.400 0.919 + ``` + +## Sorting + +- **sort values** - it is present both in series and dataframe. the default sort order is ascending. but, it is not in place. with most commands, i was reassigning the actual variable itself. there is another way to achieve this though when using these functions - passing in true for the **in place** argument + ```py + happiness_indexed.sort_values("Healthy life expectancy", ascending=False, inplace=True) + + # Healthy life expectancy Freedom to make life choices + # Country name + # Singapore 76.953 0.927 + # Hong Kong 76.820 0.717 + # Japan 75.100 0.796 + ``` +- sorting by multiple columns - descending by number of bedrooms, ascending by number of bathrooms - + ```py + houses = pd.read_csv("data/kc_house_data.csv") + houses.sort_values(["bedrooms", "bathrooms"], ascending=[False, True]) + + # bedrooms bathrooms + # 15870 33 1.75 + # 8757 11 3.00 + # 15161 10 2.00 + ``` +- when sorting by a text column, e.g. name, the sorting will use the ascii value, so `Arjun` comes before `abhishek`. we can use the **key** function in pandas to provide a custom lambda to use when sorting rows - + ```py + titanic.sort_values("name", inplace=True, key=lambda name: name.str.lower()) + ``` +- **sort index** - helps sort the data by the index / labels - + ```py + happiness.sort_index(inplace=True) + ``` +- we can call **sort values** / **sort index** on pandas series as well - the difference here is providing the column would not be required when sorting by values +- by default, value counts sorts on frequency. however, this might not make sense when we try to plot it - houses with 3 bedrooms would appear before houses with 1 bedroom on the x axis. so, we sort by the number of bedrooms i.e. the **index** + ```py + bedrooms_stats = houses["bedrooms"].value_counts() + bedrooms_stats + + # bedrooms + # 3 9824 + # 4 6882 + # 2 2760 + # 5 1601 + # 6 272 + # 1 199 + + bedrooms_stats.sort_index(inplace=True) + bedrooms_stats.plot(kind="bar") + ``` + ![](/assets/img/python-data-analysis/bedroom_sort_index_example.png) + +## Indexing + +- we already tried accessing data using columns. [using one column](#series-and-columns) gives us a pandas series, while [using multiple columns](#intermediate-operations) gives a dataframe. we only need a pair of square braces for accessing columns +- to access particular rows, we can use **loc** / **iloc** +- e.g. our data is indexed using country name. we can access the data for a particular country using loc. 
output format i believe is a series, where the labels are the column names + ```py + happiness + # Healthy life expectancy Freedom to make life choices + # Country name + # Afghanistan 52.493 0.382 + # Albania 68.999 0.785 + # Algeria 66.005 0.480 + # Argentina 69.000 0.828 + + happiness.loc["Yemen"] + # Healthy life expectancy 57.122 + # Freedom to make life choices 0.602 + ``` +- just like when accessing columns, if we use an additional pair of square braces, we start getting a dataframe instead + ```py + happiness.loc[["Yemen"]] + + # Healthy life expectancy Freedom to make life choices + # Country name + # Yemen 57.122 0.602 + ``` +- we can also use **slicing** with **loc** - e.g. get all the rows between denmark to france. note - remember to sort using index first for this to work properly + ```py + happiness.sort_index(inplace=True) + happiness.loc["Denmark" : "France"] + + # Healthy life expectancy Freedom to make life choices + # Country name + # Denmark 72.700 0.946 + # Ecuador 68.800 0.842 + # Egypt 61.998 0.749 + # Finland 72.000 0.949 + # France 74.000 0.822 + ``` +- **iloc** - access rows using **integer position-based indexing** +- e.g. i want the 20th country alphabetically - i may not know what it is. i can however access it using iloc. again, i get back a series + ```py + happiness.iloc[19] + + # Healthy life expectancy 62.000 + # Freedom to make life choices 0.959 + ``` +- e.g. if i wanted the 1st 3rd and 5th countries, i add an additional pair of square braces, and again, get back a dataframe this time around - + ```py + happiness.iloc[[0, 2, 4]] + + # Healthy life expectancy Freedom to make life choices + # Country name + # Afghanistan 52.493 0.382 + # Algeria 66.005 0.480 + # Armenia 67.055 0.825 + ``` +- finally, again with iloc, we can also use slicing. we will use integer positions, where we can specify start, end and optionally a step + ```py + happiness.iloc[0:5] + ``` +- so, loc uses values of index, iloc uses numeric position +- again, we can use both **loc** and **iloc** on **series** as well + +## Filtering + +- carefully look at the three steps we follow below for **filtering** - we can use a column to get a series, we generate a boolean series from it by using conditions, and finally we get the rows that hold true for the corresponding position in the boolean series + ```py + df + + # name sex age + # 0 Allen, Miss. Elisabeth Walton female 29 + # 1 Allison, Master. Hudson Trevor male 0.9167 + # 2 Allison, Miss. Helen Loraine female 2 + + + df['sex'] + + # 0 female + # 1 male + # 2 female + + + df['sex'] == 'female' + + # 0 True + # 1 False + # 2 True + + + df[df['sex'] == 'female'] + + # name sex age + # 0 Allen, Miss. Elisabeth Walton female 29 + # 2 Allison, Miss. Helen Loraine female 2 + ``` +- we saw `==` above. we can also use the other **comparison** operators like `!=`, `>=`, `>`, `<=`, `<`, etc + ```py + houses[houses['price'] > 5000000] + + # id date price bedrooms bathrooms + # 1164 1247600105 20141020T000000 5110800.0 5 5.25 + # 3914 9808700762 20140611T000000 7062500.0 5 4.50 + # 7252 6762700020 20141013T000000 7700000.0 6 8.00 + # 9254 9208900037 20140919T000000 6885000.0 6 7.75 + ``` +- series have a method called **between** which we can use. e.g. find houses with bedrooms in the range 5 to 7 - + ```py + houses[houses['bedrooms'].between(5, 7)].value_counts('bedrooms') + + # bedrooms + # 5 1601 + # 6 272 + # 7 38 + ``` +- we can use **isin**, e.g. 
find netflix movies in india or south korea - + ```py + netflix[netflix['country'].isin(['India', 'South Korea'])].value_counts('country') + + # country + # India 972 + # South Korea 199 + ``` +- we can combine conditions using boolean operators - + ```py + women = titanic['sex'] == 'female' + died = titanic['survived'] == 0 + titanic[women & died] + + # sex survived pclass cabin + # 2 female 0 1 C22 C26 + # 4 female 0 1 C22 C26 + # 105 female 0 1 A29 + ``` +- note - doing it in one line - do not forget parentheses, otherwise python cannot parse it correctly due to priority - + ```py + titanic[(titanic['sex'] == 'female') & (titanic['survived'] == 0)] + ``` +- similarly, we can use `|` for or, `~` for negation +- **isna** - returns true for rows where the column is missing a value - + ```py + netflix[netflix['director'].isna()] + + # show_id type title director + # 1 s2 TV Show Blood & Water NaN + # 3 s4 TV Show Jailbirds New Orleans NaN + # 4 s5 TV Show Kota Factory NaN + # 10 s11 TV Show Vendetta: Truth, Lies and The Mafia NaN + # 14 s15 TV Show Crime Stories: India Detectives NaN + ``` +- my understanding - everywhere above, we are trying to filter using a column value. we can use the index as well though - recall - we saw in [series](#series-and-columns) that we can access the labels using **index**. my understanding - the point is, whatever we did using `dataframe[column]`, can be done using `dataframe.index` as well - + ```py + countries[countries.index != 'Denmark'] + ``` + +## Modifying Columns and Indices + +- **dropping columns** - we use the **drop** method. we need to specify the columns to drop, and the **axis**. the same drop method can be used to drop rows as well, hence we need to specify the axis. axis can be set to - + - 0 / **index** to drop rows + - 1 / **columns** to drop columns + + ```py + bitcoin.drop(labels=['Name', 'Symbol'], axis='columns') + ``` +- another way to do this is to just pass in the **columns** parameter directly, instead of passing in **labels** and **axis** + ```py + bitcoin.drop(columns=['Name', 'Symbol']) + ``` +- till now, we saw dropping columns. we can also **drop rows** using one of the following ways - + ```py + # method 1 + countries.drop(labels=['Denmark', 'Finland', 'Iceland'], axis='index') + + # method 2 - shorthand and my favorite of the three + countries.drop(index=['Denmark', 'Finland', 'Iceland']) + + # method 3 - it is the first positional argument + # so we can skip providing the "index" kwarg + countries.drop(['Denmark', 'Finland', 'Iceland']) + ``` +- drop all countries except the first 10. 
we can pass an index series as well + ```py + countries.drop(countries.index[10:]) + ``` +- creating a new column with a **constant value** - + ```py + titanic['constant'] = 'something' + ``` +- creating a new column with **dynamic values** - + ```py + # number of relatives = number of parents and children + number of siblings and spouses + titanic["relatives"] = titanic["parch"] + titanic["sibsp"] + ``` +- **renaming columns** - i think the arguments are similar to **drop** - instead of **labels** and **axis**, we pass **mapper** and **axis** + ```py + mapper = { 'Regional indicator': 'regional_indicator', 'Ladder score': 'ladder_score' } + + countries.rename(mapper=mapper, axis='columns') + countries.rename(columns=mapper) + ``` +- similarly, we can **rename indices** - + ```py + mapper = { 'Netherlands': 'the_netherlands' } + + countries.rename(mapper=mapper, axis='index') + countries.rename(index=mapper) + countries.rename(mapper) + ``` +- a complex problem - find the show called "Evil", and change its index label to s6666 inplace + ```py + evil_index = netflix[netflix['title'] == 'Evil'].index[0] + netflix.rename(index={ evil_index: 's6666' }, inplace=True) + ``` + +## Updating Values + +- my understanding - we have already seen tricks to change column names / index names using **rename**. now we look at **replace** - the way of renaming the actual values inside +- we can use **replace** as follows. again, it is not **in place** by default, so we need to pass true for in place explicitly + ```py + titanic.replace({'sex': { 'female': 'F', 'male': 'M' }}, inplace=True) + ``` +- this method is supported for **series** as well and not just **dataframes**. so, we can use the technique below as well - + ```py + titanic['sex'] = titanic['sex'].replace({ 'female': 'F', 'male': 'M' }) + # OR + titanic['sex'].replace({ 'female': 'F', 'male': 'M' }, inplace=True) + ``` +- in the titanic dataset, all unknown values in the age column hold `?`. we can replace them with **none**. note the use of **dropna** in **value counts**, i do not think this was mentioned earlier + ```py + titanic.value_counts('age') + # age + # ? 263 + # 24 47 + # ... + + titanic.replace({ 'age': { '?': None } }, inplace=True) + titanic.value_counts('age', dropna=False) + # age + # NaN 263 + # 24 47 + # ... 
+ ``` +- we can use **replace** when all values that match "x" in a column "a" need to be replaced +- but imagine i want to replace values in a column "a", but only for specific rows - + - we know the [indices](#indexing) of the rows + - we have a [filtering condition](#filtering) to filter the desired rows +- we can use **loc** for both use cases above + ```py + countries.loc[['Denmark', 'Sweden', 'Norway'], ['Regional indicator']] = 'Scandinavia' + ``` +- setting multiple columns to a single value - + ```py + countries.loc[['Finland', 'Denmark'], ['upperwhisker', 'lowerwhisker']] = 4.5 + ``` +- setting multiple columns, each to its own specific value - + ```py + countries.loc[['Finland', 'Denmark'], ['upperwhisker', 'lowerwhisker']] = [4.5, 2.8] + ``` +- till now, even in [here](#indexing), we have tied accessing rows whose indices we know +- however, loc can be passed the boolean pandas series we saw in [filtering](#filtering) as well - + ```py + houses.loc[houses['bedrooms'] >= 10] + # id date price bedrooms bathrooms + # 8757 1773100755 20140821T000000 520000.0 11 3.00 + # 13314 627300145 20140814T000000 1148000.0 10 5.25 + # 15161 5566100170 20141029T000000 650000.0 10 2.00 + # 15870 2402100895 20140625T000000 640000.0 33 1.75 + ``` +- advantage of the above - we can now conditionally update certain rows. we have already seen how to update rows using loc, and we know how to filter rows based on conditions + ```py + houses.loc[houses['bedrooms'] >= 10, ['bedrooms']] = 9999 + houses.loc[houses['bedrooms'] == 9999] + # id date price bedrooms bathrooms + # 8757 1773100755 20140821T000000 520000.0 9999 3.00 + # 13314 627300145 20140814T000000 1148000.0 9999 5.25 + # 15161 5566100170 20141029T000000 650000.0 9999 2.00 + # 15870 2402100895 20140625T000000 640000.0 9999 1.75 + ``` +- a complex problem - add a new column 'luxurious' - set it to 'yes' for houses with grade > 12 and view = 4, and set it to 'no' for others - + ```py + good_view = houses['view'] == 4 + good_quality = houses['grade'] > 12 + houses[good_view & good_quality] + # price view grade + # 9254 6885000.0 4 13 + # 14556 2888000.0 4 13 + # 19017 3800000.0 4 13 + + houses['luxurious'] = 'no' + houses.loc[good_view & good_quality, ['luxurious']] = 'yes' + + houses[houses['luxurious'] == 'yes'] + # price view grade luxurious + # 9254 6885000.0 4 13 yes + # 14556 2888000.0 4 13 yes + # 19017 3800000.0 4 13 yes + ``` + +## Data Types + +- we have a dataset that holds `?` for columns for missing values in the csv + ```py + titanic.info() + # # Column Non-Null Count Dtype + #--- ------ -------------- ----- + # 0 pclass 1309 non-null int64 + # 1 survived 1309 non-null int64 + # 2 name 1309 non-null object + # 4 age 1309 non-null object + + titanic['age'].value_counts() + # age + # ? 263 + # 24 47 + # 22 43 + # 21 41 + ``` +- issue - we cannot do things like finding the mean age +- solution - we convert the data type of the age column. first, we **replace** `?` with **none**, then we **cast** it to type float - + ```py + titanic['age'] = titanic['age'].astype('float') + # ValueError: could not convert string to float: '?' + + titanic['age'] = titanic.replace({ 'age': { '?' 
: None } }).astype('float') + + titanic.info() + # # Column Non-Null Count Dtype + #--- ------ -------------- ----- + # 0 pclass 1309 non-null int64 + # 1 survived 1309 non-null int64 + # 2 name 1309 non-null object + # 4 age 1046 non-null float64 + + titanic['age'].value_counts(dropna=False) + # age + # NaN 263 + # 24.0000 47 + # 22.0000 43 + ``` +- now, we can use numeric functions like for e.g. `titanic['age'].mean()` +- another option - **to numeric** - it is a more aggressive alternative to the one we saw earlier. earlier, we manually ran replace for all the question marks to be replaced by none, and then did the type conversion from **object** to **float**. now, with the below approach, we will say try converting it to numeric, and if you cannot, just put a none in there. the default value of **errors** is **raise** i.e. raise an exception when you encounter an error. we typically change it to **coerce** for getting the behavior we described + ```py + titanic['body'] = pd.to_numeric(titanic['body'], errors='coerce') + ``` +- **category** type - useful when a column has a set of **finite** possible values, e.g. gender +- advantage - less memory usage etc +- by default, this is the situation. specially look at the **dtype** column and **memory usage** in the output + ```py + titanic['sex'].value_counts() + # sex + # male 843 + # female 466 + + titanic.info() + # # Column Non-Null Count Dtype + # --- ------ -------------- ----- + # 3 sex 1309 non-null object + # memory usage: 143.3+ KB + ``` +- when we manually cast gender to type of **category**, the output looks like follows. look how the type is now changed and the memory usage too has reduced + ```py + titanic['sex'] = titanic['sex'].astype('category') + + titanic['sex'].value_counts() + # sex + # male 843 + # female 466 + + titanic.info() + # # Column Non-Null Count Dtype + # --- ------ -------------- ----- + # 3 sex 1309 non-null category + # memory usage: 134.5+ KB + + titanic['sex'] + # 0 female + # 1 male + # ... + # 1307 male + # 1308 male + # Name: sex, Length: 1309, dtype: category + # Categories (2, object): ['female', 'male'] + ``` + +## NA Values + +- **is na** - returns true for cells that do not contain a value. can be called on both the dataframe and series. look at the last statement, where we generate a boolean series that represent all rows which contain null for league, and then use it as a filter condition + ```py + game_stats = pd.read_csv('data/game_stats.csv') + game_stats + # name league points assists rebounds + # 0 bob nba 22.0 5.0 10.0 + # 1 jessie NaN 10.0 NaN 2.0 + # 2 stu euroleague NaN NaN NaN + # 3 jackson aba 9.0 NaN 2.0 + # 4 timothee NaN 8.0 NaN NaN + # 5 steph nba 49.0 8.0 10.0 + # 6 NaN NaN NaN NaN NaN + + game_stats.isna() + # name league points assists rebounds + # 0 False False False False False + # 1 False True False True False + # 2 False False True True True + # 3 False False False True False + # 4 False True False True True + # 5 False False False False False + # 6 True True True True True + + game_stats['league'].isna() + # 0 False + # 1 True + # 2 False + # 3 False + # 4 True + # 5 False + # 6 True + + game_stats[game_stats['league'].isna()] + # name league points assists rebounds + # 1 jessie NaN 10.0 NaN 2.0 + # 4 timothee NaN 8.0 NaN NaN + # 6 NaN NaN NaN NaN NaN + ``` +- **drop na** - dropping rows with missing values. it too creates a new copy unless we specify **in place** explicitly +- to drop rows where any of the columns hold null, specify the **how** parameter as **any**. 
note - this is also the default i.e. when we call **drop na** without any parameters + ```py + game_stats.dropna(how='any') + + # name league points assists rebounds + # 0 bob nba 22.0 5.0 10.0 + # 5 steph nba 49.0 8.0 10.0 + ``` +- drop only rows where all the columns hold null - specify the **how** parameter as **all** + ```py + game_stats.dropna(how='all') + + # name league points assists rebounds + # 0 bob nba 22.0 5.0 10.0 + # 1 jessie NaN 10.0 NaN 2.0 + # 2 stu euroleague NaN NaN NaN + # 3 jackson aba 9.0 NaN 2.0 + # 4 timothee NaN 8.0 NaN NaN + # 5 steph nba 49.0 8.0 10.0 + ``` +- drop rows where any of the specified columns are not present + ```py + game_stats.dropna(subset=['points', 'rebounds']) + + # name league points assists rebounds + # 0 bob nba 22.0 5.0 10.0 + # 1 jessie NaN 10.0 NaN 2.0 + # 3 jackson aba 9.0 NaN 2.0 + # 5 steph nba 49.0 8.0 10.0 + ``` +- finally, we can drop columns as well by setting the **axis** parameter, e.g. drop all columns where any of the rows contain missing values for it + ```py + netflix.dropna(how='any', axis=1) + ``` +- note - **drop na** works for **series** as well +- we can use **fill na** to fill the cells missing values with a particular value +- if we call it directly with a value, it would apply to all columns - + ```py + game_stats.fillna(0) + + # name league points assists rebounds + # 0 bob nba 22.0 5.0 10.0 + # 1 jessie 0 10.0 0.0 2.0 + # 2 stu euroleague 0.0 0.0 0.0 + # 3 jackson aba 9.0 0.0 2.0 + # 4 timothee 0 8.0 0.0 0.0 + # 5 steph nba 49.0 8.0 10.0 + # 6 0 0 0.0 0.0 0.0 + ``` +- we can however specify specific columns like so - + ```py + game_stats.fillna({ 'points': 10.0, 'assists': 0 }) + + # name league points assists rebounds + # 0 bob nba 22.0 5.0 10.0 + # 1 jessie NaN 10.0 0.0 2.0 + # 2 stu euroleague NaN 0.0 NaN + # 3 jackson aba 9.0 0.0 2.0 + # 4 timothee NaN 8.0 0.0 NaN + # 5 steph nba 49.0 8.0 10.0 + # 6 anonymous NaN NaN 0.0 NaN + ``` +- fun exercise - instead of using **fill na**, use [**loc**](#updating-values) for updating values where it is missing + ```py + netflix.loc[netflix['rating'].isna(), 'rating'] = 'TV-MA' + netflix.fillna({ 'rating': 'TV-MA' }) + ``` +- assume we have two columns in a sales table for shipping and billing addresses. we would like to default the shipping address to the billing address wherever shipping address is missing. we can do it as follows - + ```py + sales.fillna({ 'shipping_zip': sales['billing_zip'] }, inplace=True) + ``` +- my understanding - based on above point, we can specify a **series** as well for the value, and it will fill using the corresponding value in the series wherever a null is encountered + +## Dates and Times + +- dates an be present in lots different formats - months can be in words or numbers, days can come before months or the other way around, separator can be - or /, etc +- my understanding - the **to datetime** function does a (mostly) great job at auto detecting the date time format + ```py + pd.to_datetime('31 Dec. 2019') # Timestamp('2019-12-31 00:00:00') + pd.to_datetime('12/31/2019') # Timestamp('2019-12-31 00:00:00') + ``` +- we can however pass it parameters to configure its behavior as well in case of ambiguity +- e.g. 
look below how we use **day first** and **year first** to get different outputs for the same input - + ```py + pd.to_datetime('10-11-12') # Timestamp('2012-10-11 00:00:00') + pd.to_datetime('10-11-12', dayfirst=True) # Timestamp('2012-11-10 00:00:00') + pd.to_datetime('10-11-12', yearfirst=True, dayfirst=True) # Timestamp('2010-12-11 00:00:00') + ``` +- we can use the more powerful **format** as well. [format codes reference](https://docs.python.org/3/library/datetime.html#format-codes) + ```py + pd.to_datetime('10-11-12', format='%y-%d-%m') # Timestamp('2010-12-11 00:00:00') + + meetings = ['Dec 11 2019 Meeting', 'Jan 15 2024 Meeting', 'Mar 7 2024 Meeting'] + pd.to_datetime(meetings, format='%b %d %Y Meeting') + # DatetimeIndex(['2019-12-11', '2024-01-15', '2024-03-07'], dtype='datetime64[ns]', freq=None) + ``` +- python's default behavior - try parsing it like a numeric column if possible, else change to object. so, converting a dataframe column to datetime format using **astype** - + ```py + ufos.info() + # ... + # 1 city 87888 non-null object + # 2 state 82890 non-null object + # 3 date_time 86938 non-null object + + ufos['date_time'] = pd.to_datetime(ufos['date_time']) + + ufos.info() + # ... + # 1 city 87888 non-null object + # 2 state 82890 non-null object + # 3 date_time 86938 non-null datetime64[ns] + ``` +- we can also specify the datetime columns upfront while reading a csv, instead of converting it later - + ```py + ufos = pd.read_csv('data/nuforc_reports.csv', parse_dates=['date_time']) + ufos.info() + # ... + # 1 city 87888 non-null object + # 2 state 82890 non-null object + # 3 date_time 86938 non-null datetime64[ns] + ``` +- there are also keyword arguments for specifying the datetime format etc in the read csv call, refer documentation +- we can access the **date time properties** object on the column of type datetime64. we access it using **dt**. [full list of properties available](https://pandas.pydata.org/docs/user_guide/timeseries.html#time-date-components) +- e.g. view the top 10 years with the most ufo sightings. we first need to extract just the year from the datetime column, and then, we can chain **value counts** with **nlargest** to get the top 10 years + ```py + ufos['date_time'].dt.year.value_counts().nlargest(10).plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/value-counts-ufo-sightings-by-year-datetime.png) +- **comparing datetime columns** + - notice how we provide strings and pandas can parse it for us automatically. the example below will give us all the ufo sightings since 12am on 22 december, 2019 + ```py + ufos[ufos['date_time'] > '2019-12-22'] + ``` + - we already saw how we can access properties on a datetime column. we can use it to perform filtering as well. the example below will give us all sightings where the hour was 2 .i.e. it could have happened at 2.30am, 2.49am etc + ```py + ufos[ufos['date_time'].dt.hour == 2.0] + ``` +- **time deltas** - we get this when we subtract two datetime objects. e.g. get the people who waited the longest to report after seeing a ufo + ```py + ufos['posted'] - ufos['date_time'] + # 0 9 days 05:17:00 + # 1 6 days 05:30:00 + # ... 
+ # 88123 1 days 02:00:00 + # 88124 1 days 02:00:00 + # Length: 88125, dtype: timedelta64[ns] + + (ufos['posted'] - ufos['date_time']).nlargest(5) + # 86762 18463 days 00:00:00 + # 87145 18353 days 22:30:00 + # 721 18314 days 03:00:00 + # 1576 18287 days 00:00:00 + # 1580 18240 days 08:00:00 + # dtype: timedelta64[ns] + ``` +- just like in datetime, we can also access properties of time deltas. [full list here](https://pandas.pydata.org/docs/user_guide/timedeltas.html#attributes) +- a complex example - + - find the homes sold between may 1st 2014 and may 1st 2015 + - create a bar plot showing the total number of sales per month in that period + - the x axis should be in calendar order (1-12) +- we filter to get all houses sold in the time period. then, we extract the month and perform value counts on it. finally, we sort by index i.e. by months since by default, value counts will sort by counts. finally, we plot it + ```py + houses_sold = houses[houses['date'].between('05-01-2014', '05-01-2015')] + houses_sold['date'].dt.month.value_counts().sort_index().plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/houses-sold-in-period-monthwise.png) +- i wanted to get the **week of year**. i could not access any such property on dt. so, i instead tried to format the date into a string. then, we can use the [format codes](https://docs.python.org/3/library/datetime.html#format-codes) we mentioned earlier as well +- e.g. create a line plot showing the total number of sales by week of the year number (1-52) +- first we obtain the week number the house was sold in. then, we obtain the value counts for each of the week. then we sort by the index i.e. the week number, because otherwise the x axis of the line plot will not be sorted and look weird - recall that value counts will sort by counts and not index + ```py + plt.figure(figsize=(15, 5)) + plt.xticks(range(52)) + houses['date'].dt.strftime('%V').value_counts().sort_index().plot(kind='line') + ``` + ![](/assets/img/python-data-analysis/houses-sold-by-week-datetime-line.png) + +## Matplotlib + +- till now, whenever we called **plot** on pandas series, it was actually calling matplotlib bts +- however, it can have limitations, which is when we might want to interact with matplotlib +- most common way of importing matplotlib - + ```py + import matplotlib.pyplot as plt + ``` +- when we do the following, it defaults values on the x axes to be 0, 1, 2, ... + ```py + plt.plot([2, 6, 2, 4, 8]) + ``` + ![](/assets/img/python-data-analysis/matplotlib-very-basic.png) +- we can specify values for both x and y as follows - + ```py + salaries = [20000, 50000, 60000, 100000, 250000, 150000] + ages = [20, 25, 30, 32, 45, 65] + plt.plot(ages, salaries) + ``` + ![](/assets/img/python-data-analysis/matplotlib-basics-specifying-both-axes.png) +- note - when we call **plot** in jupyter notebook, before the actual graph, we see a line like this - `[]`. this is actually the output of plot, but jupyter is smart enough to show us the output at the back of it as well. 
we will have to call `plt.show()` if we are not working on a jupyter notebook +- matplotlib terminology - the top level container is a **figure** +- a figure can have multiple **axes** +- each axes is a combination of **labels**, **data**, etc +- assume we have the following sample data - + ```py + nums = range(6) + nums_squared = [num**2 for num in nums] + nums_cubed = [num**3 for num in nums] + ``` +- when we have the following code, all of them are plotted on the **same figure** and the **same axes** + ```py + plt.plot(nums) + plt.plot(nums_squared) + plt.plot(nums_cubed) + ``` + ![](/assets/img/python-data-analysis/same-figure-same-axes.png) +- we call **figure** to create a new figure and make it the current figure. so when we call **plot**, it basically plots on the current active figure. so, with the code below, all of them are plotted on **different figures** + ```py + plt.figure(figsize=(4,3)) + plt.plot(nums) + + plt.figure(figsize=(4,3)) + plt.plot(nums_squared) + + plt.figure(figsize=(4,3)) + plt.plot(nums_cubed) + ``` + ![](/assets/img/python-data-analysis/different-figure-i.png) + ![](/assets/img/python-data-analysis/different-figure-ii.png) + ![](/assets/img/python-data-analysis/different-figure-iii.png) +- note how we control the size of a figure in matplotlib - we can pass **figsize** and **dpi** or **dots per inch** to figure. i usually just touch figsize, which defaults to 6.4, 4.8 +- we can specify the linestyle when plotting as follows. notice the shorthand at the third call as well + ```py + plt.plot(nums, nums, linestyle='dashed') + plt.plot(nums, nums_squared, linestyle='dotted') + plt.plot(nums, nums_cubed, linestyle='-.') + ``` + ![](/assets/img/python-data-analysis/plot-line-styles.png) +- when calling plot, we can specify parameters like **color**, **linewidth**, etc as well if needed +- we can also specify **markers** and their styling + ```py + plt.plot(nums, nums_cubed, marker='o') + ``` + ![](/assets/img/python-data-analysis/plot-with-markers.png) +- we can use **title** to set a title for the **axes**, and **labels** to set labels for the x and y axis individually + ```py + plt.plot(nums, nums_squared) + plt.title("Squares of Numbers") + plt.xlabel("Input") + plt.ylabel("Squares") + ``` + ![](/assets/img/python-data-analysis/plot-with-title.png) +- remember - all these methods we see - plot, title, xlabel and ylabel, and others that we see later - also accept a ton of options to control the size, spacing, color, positioning, etc. refer documentation as and when needed +- when we try to plot the below, look at the default graph. notice how on x axis for e.g., matplotlib itself decided that it should start the **ticks** from 3 etc + ```py + nums = [3, 3.5, 4, 7, 9] + nums_squared = [num**2 for num in nums] + plt.plot(nums, nums_squared, marker='o') + ``` + ![](/assets/img/python-data-analysis/default-without-ticks.png) +- option 1 - we can add custom ticks using **xticks** and **yticks**. it serves two purposes - + - we can only provide the first argument. this controls what ticks should show up + - we can provide the second argument as well. this controls what the actual tick should be named inside the graph + + ```py + plt.plot(nums, nums_squared, marker='o') + plt.xticks([1, 2, 3, 4, 7, 8, 9], ['one', 'two', 'three', 'four', 'seven', 'eight', 'nine']) + ``` + ![](/assets/img/python-data-analysis/custom-ticks.png) +- option 2 - we can only modify the **limits**. e.g. 
we would like the x axis start from -2 and end at 20 for some reason + ```py + plt.plot(nums, nums_squared, marker='o') + plt.xlim(-2, 15) + ``` + ![](/assets/img/python-data-analysis/custom-tick-limits.png) +- **legend** - helps distinguish between the different graphs using **labels** when they are on the same **axes** in the same **figure** + ```py + nums = [1, 2, 3, 4] + nums_squared = [num ** 2 for num in nums] + nums_cubed = [num ** 3 for num in nums] + + plt.plot(nums, nums, label='linear') + plt.plot(nums, nums_squared, label='squared') + plt.plot(nums, nums_cubed, label='cubed') + + plt.legend() + ``` + ![](/assets/img/python-data-analysis/same-figure-same-axes-with-legend.png) +- plotting **bar** charts. assume we have the following data - + ```py + plants = ['spinach', 'turnip', 'rhubarb', 'broccoli', 'kale'] + died = [10, 25, 5, 30, 21] + germinated = [74, 88, 56, 69, 59] + ``` +- by default, the different charts would be one on top of another - + ```py + plt.bar(plants, germinated) + plt.bar(plants, died) + ``` + ![](/assets/img/python-data-analysis/plotting-bar-basics.png) +- this is how i got them to show one beside another - + - i ensured **width** of the first graph is positive while the second one is negative, so that they appear on either sides of the x **tick** + - i also ensured they are **0.25%** of their actual width, as this ensures the right spacing. if for e.g. i did 0.5, the second bar of a tick will touch the first bar of the next tick + - i set **align** to **edge**. this alsigns them to the edge of the tick. the default is **center** (refer the graph created by default above) + + ```py + plt.bar(plants, germinated, width=0.25, align='edge') + plt.bar(plants, died, width=-0.25, align='edge') + ``` + ![](/assets/img/python-data-analysis/multiple-bar-graph-alignment-side-by-side.png) +- bar also receives another keyword argument - **bottom** + ```py + plt.bar(plants, germinated, bottom=[20, 20, 20, 20, 20]) + plt.ylim(0, 120) + ``` + ![](/assets/img/python-data-analysis/bar-graph-with-custom-bottom.png) +- use case of the stupidity above 🤣 - we can get the different bars to **stack** one on top of another. the y coordinates of one graph becomes the bottom of another + ```py + plt.bar(plants, died, bottom=germinated, label='died') + plt.bar(plants, germinated, label='germinated') + plt.legend() + ``` + ![](/assets/img/python-data-analysis/stacked-bar-graph.png) +- we can use **barh** instead of bar for horizontal bar graphs. notice how for **stacking**, the **bottom** changes to **left** + ```py + plt.barh(plants, died, left=germinated, label='died') + plt.barh(plants, germinated, label='germinated') + plt.legend() + ``` + ![](/assets/img/python-data-analysis/stacked-horizontal-bar-graph.png) +- **histogram** - assume we have the following data. note - i did a value count to explain the distribution of data - + ```py + nums = [1,2,2,3,5,4,2,2,1,1,3,4,4,2,1,5,2,3,4,5] + + { num: nums.count(num) for num in nums } + # {1: 4, 2: 6, 3: 3, 4: 4, 5: 3} + ``` +- when i try to create a histogram on this data, it looks as follows by default - + ```py + plt.hist(nums) + ``` + ![](/assets/img/python-data-analysis/default-histogram-without-binning.png) +- we can configure the **bins** as follows. my understanding - 1 and 2 together have frequency of 10, 3 has frequency of 3 while 4 and 5 together have frequency of 7. 
now, the range has been divided into three parts 1-2.33, 2.33-3.67, 3.67-5, and the histogram has been plotted accordingly + ```py + plt.hist(nums, bins=3) + ``` + ![](/assets/img/python-data-analysis/histogram-with-custom-binning.png) +- **histograms** are a little different i feel because unlike pie chart, bar graph, etc where we give the actual values to be plotted, here, we only give a series of values and it automatically calculates the frequency and bins them accordingly +- a realistic example - comparing the distribution of ages of people travelling in first class vs third class in the titanic. observation - more younger people were travelling in third class, whereas more older people were travelling in first class. also, note how we change the alpha to visualize them simultaneously + ```py + titanic = pd.read_csv('/content/drive/MyDrive/Python - Basic Data Analysis/titanic.csv') + + # clean the age column + titanic.replace({ 'age': { '?': None } }, inplace=True) + titanic['age'] = titanic['age'].astype('float') + + # extract the ages of first and third class + first_class_ages = titanic[titanic['pclass'] == 1]['age'] + third_class_ages = titanic[titanic['pclass'] == 3]['age'] + + # plot the ages + plt.hist(first_class_ages, alpha=0.5, label='first') + plt.hist(third_class_ages, alpha=0.5, label='third') + + plt.legend() + ``` + ![](/assets/img/python-data-analysis/histogram-basic-project.png) +- **pie charts** + - we use the **explode** parameter to disconnect the sectors from the pie chart. the fraction determines how far out the sectors would be from the pie. the order is the same as the order of the labels + - we use the **autopct** parameter to add percentages inside the sectors. we are using `autopct='%0.0f%%'` here, if we would have used for e.g. `autopct='%.2f'`, it would have shown in this form - `57.73` (with 2 decimal places and without the `%`) + + ```py + plt.pie(costs, labels=labels, autopct='%0.0f%%', explode=(0, 0.1, 0, 0, 0.1)) + plt.show() + ``` + ![](/assets/img/python-data-analysis/pie-chart-example.png) +- **subplots** - multiple **axes** in the same **figure** + - we use **subplot** to tell the **dimensions** and the correct **subplot index**.
in the example below, we say 1 row, 3 columns, and go 1, 2 and 3 respectively for the index + - **title** is used for individual **axes** headings while **suptitle** is used for the **figure** heading + - we call **tight layout**, as it helps python adjust the padding around subplots + + ```py + nums = [1, 2, 3, 4, 5] + nums_squared = [num ** 2 for num in nums] + nums_cubed = [num ** 3 for num in nums] + + plt.figure(figsize=(12, 4)) + plt.suptitle("Polynomials") + + plt.subplot(1, 3, 1) + plt.title("X") + plt.plot(nums, nums) + + plt.subplot(1, 3, 2) + plt.title("X Squared") + plt.plot(nums, nums_squared) + + plt.subplot(1, 3, 3) + plt.title("X Cubed") + plt.plot(nums, nums_cubed) + + plt.tight_layout() + plt.show() + ``` + ![](/assets/img/python-data-analysis/subplots-example.png) +- now, imagine if we go back to our titanic example, and we want to plot all three classes - first second and third in different subplots - + ```py + titanic = pd.read_csv('/content/drive/MyDrive/Python - Basic Data Analysis/titanic.csv') + titanic['age'] = pd.to_numeric(titanic['age'], errors='coerce') + + first_class_ages = titanic[titanic['pclass'] == 1]['age'] + second_class_ages = titanic[titanic['pclass'] == 2]['age'] + third_class_ages = titanic[titanic['pclass'] == 3]['age'] + + plt.figure(figsize=(12, 4)) + plt.suptitle('titanic class vs age distribution') + + plt.subplot(1, 3, 1) + plt.title('1st class') + plt.hist(first_class_ages) + + plt.subplot(1, 3, 2) + plt.title('2nd class') + plt.hist(second_class_ages) + + plt.subplot(1, 3, 3) + plt.title('3rd class') + plt.hist(third_class_ages) + + plt.tight_layout() + plt.show() + ``` + ![](/assets/img/python-data-analysis/titanic-subplot-without-shared-axes.png) +- issue - we know that the scale in the third vs other plots are different i.e. a lot more people are travelling in the third class than in the 2nd and 1st class. this is not evident right off the bat by looking at the graph. hence, we can specify the **sharey** parameter + ```py + # ... + axes = plt.subplot(1, 3, 1) + # ... + plt.subplot(1, 3, 2, sharey=axes) + # ... + plt.subplot(1, 3, 3, sharey=axes) + ``` + ![](/assets/img/python-data-analysis/titanic-subplot-with-shared-axes.png) +- question - write the code for achieving the below. 
note - do not use the plot method of pandas series and dataframes + ![](/assets/img/python-data-analysis/final-matplotlib-example.png) + ```py + houses = pd.read_csv('/content/drive/MyDrive/Python - Basic Data Analysis/kc_house_data.csv', parse_dates=['date']) + + sales_by_month = houses['date'].dt.month.value_counts().sort_index() + # date + # 1 978 + # 2 1250 + # 3 1875 + # 4 2231 + # 5 2414 + # 6 2180 + # 7 2211 + # 8 1940 + # 9 1774 + # 10 1878 + # 11 1411 + # 12 1471 + + sales_by_week_day = houses['date'].dt.day_of_week.value_counts().sort_index() + # date + # 0 4099 + # 1 4715 + # 2 4603 + # 3 3994 + # 4 3685 + # 5 287 + # 6 230 + + plt.figure(figsize=(10, 4)) + + plt.subplot(1, 2, 1) + week_days = ['Mon', 'Tue', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'] + plt.title('Sales by Week Day') + plt.xticks(sales_by_week_day.index, week_days) + plt.bar(sales_by_week_day.index, sales_by_week_day.values) + + plt.subplot(1, 2, 2) + plt.xticks(range(1, 13)) + plt.title('Sales by Month') + plt.plot(sales_by_month.index, sales_by_month.values) + + plt.tight_layout() + plt.show() + ``` +- note how we use **index** and **values** that we discussed [here](#series-and-columns) +- we also had to sort by index first before beginning to plot, because value counts sorts by values by default +- notice the use of **xticks** for renaming the labels for weekdays. i had to do the same thing for months as well, otherwise the default was 2, 4, 6, 8... + +## Matplotlib + Pandas + +- plotting a pandas **series** + ```py + titanic['sex'].value_counts().plot(kind='pie') + ``` + ![](/assets/img/python-data-analysis/pandas-series-example.png) +- plotting a pandas **dataframe** - note how it is making a bar for all columns automatically + ```py + house_area + # sqft_living sqft_lot + # 12777 13540 307752 + # 7252 12050 27600 + # 3914 10040 37325 + # 9254 9890 31374 + # 8092 9640 13068 + + house_area.plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/pandas-dataframe-example.png) +- ufo sightings by month - we use this series in the next few points, and this is what our data looks like - + ```py + ufo_sightings_by_month + # 1.0 5979 + # 2.0 4559 + # 3.0 5494 + # 4.0 5817 + # 5.0 6063 + # 6.0 8357 + # 7.0 10682 + # 8.0 8997 + # 9.0 8498 + # 10.0 8371 + # 11.0 7596 + # 12.0 6525 + ``` +- for providing parameters like **title**, we have two options - + - option 1 - in the same line. disadvantage - fewer options to configure styling etc + ```py + ufo_sightings_by_month.plot(kind='bar', title='UFO Sightings by Month', xlabel='month', ylabel='num. of sightings') + ``` + - option 2 - i think the central idea is instead of interacting only with pandas plot api, we mix with calls to matplotlib apis directly like we saw in [matplotlib](#matplotlib). advantage - now, we can configure styling etc + ```py + ufo_sightings_by_month.plot(kind='bar') + + plt.title('UFO Sightings by Month') + plt.xlabel('month') + plt.ylabel('num. of sightings') + ``` + + ![](/assets/img/python-data-analysis/ufo-sightings-by-month-numeric-month-labels.png) +- now, we would like to use month abbreviations instead.
we have multiple options - + - option 1 - use [**rename**](#modifying-columns-and-indices) to rename indices + ```py + months_lookup = { idx + 1: months[idx] for idx in range(12) } + # {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'} + + ufo_sightings_by_month_abbrev = ufo_sightings_by_month.rename(index=months_lookup) + ufo_sightings_by_month_abbrev.plot(kind='bar', title='UFO Sightings by Month') + ``` + - option 2 - use [**xticks**](#matplotlib). this is useful if we just want to modify plots but it might make testing etc difficult + ```py + ufo_sightings_by_month.plot(kind='bar', title='UFO Sightings by Month') + plt.xticks(range(12), labels=months) + ``` + + ![](/assets/img/python-data-analysis/ufo-sightings-by-month-abbrev-month-labels.png) +- by default, bar charts for dataframes looks as follows. understand that pandas is coming with reasonable defaults and helpers. there was so much effort was required from our end when doing this manually using [matplotlib](#matplotlib) - specifying **labels** and **legends**, specifying the **align** property with a negative **width**, etc + ```py + salaries + # BasePay OvertimePay OtherPay + # EmployeeName + # NATHANIEL FORD 167411.18 0.00 400184.25 + # GARY JIMENEZ 155966.02 245131.88 137811.38 + # ALBERT PARDINI 212739.13 106088.18 16452.60 + # CHRISTOPHER CHONG 77916.00 56120.71 198306.90 + + salaries.plot(kind='barh') + ``` + ![](/assets/img/python-data-analysis/pandas-salaries-default-side-by-side.png) +- making a stacked version too is so much easier compared to doing it via [matplotlib](#matplotlib) manually by specifying **bottom** / **left** etc + ```py + salaries.plot(kind='barh', stacked=True) + ``` + ![](/assets/img/python-data-analysis/pandas-salaries-stacked.png) +- the usual way - `.plot(kind='hist')`. it creates all graphs in the same axes + ```py + salaries.plot(kind='hist') + ``` + ![](/assets/img/python-data-analysis/histogram-using-same-axes.png) +- calling `.hist()` directly. it creates different **axes** for the different columns - feels like **subplots** + ```py + salaries.hist() + ``` + ![](/assets/img/python-data-analysis/histogram-using-different-axes.png) +- **box plot** - this too helps visualize distribution of values like histogram. summary according to me, might be wrong - + - we have a line at the **median** (the green line) + - the general distribution of data lies between the two **whiskers** (the two standalone blue lines) + - the **fliers** depict the outliers (the circles). e.g. one house had 33 or so bedrooms, so look at the boxplot + + ```py + houses['bedrooms'].plot(kind='box') + ``` + ![](/assets/img/python-data-analysis/default-box-plot.png) +- we can view the list of configuration parameters [here](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.boxplot.html). e.g. we can disable the fliers + ```py + houses[['bedrooms', 'bathrooms']].plot(kind='box', showfliers=False) + ``` + ![](/assets/img/python-data-analysis/configured-box-plot.png) +- **scatter plot** - how different variables, e.g. bedrooms and bathrooms correlate to eachother. 
refer [this](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.scatter.html) for different configuration options + ```py + houses.plot(kind='scatter', x='bedrooms', y='bathrooms', marker='x', c='#2ca02c') + ``` + ![](/assets/img/python-data-analysis/scatter-plot-bedrooms-vs-bathrooms.png) +- adding multiple graphs to the same **axes** on the same **figure** - same as we saw in [matplotlib](#matplotlib) i.e. we need to call **figure** on plt for creating a new figure, else the current active figure is used +- e.g. - ufo sightings have a shape attribute. find the 5 most common shapes, and plot them on the same axes. use a legend to differentiate between them. plot them for the range 2000-2018 + ```py + common_shapes = ufos['shape'].value_counts().nlargest(5) + + for common_shape in common_shapes.index: + years = ufos[ufos['shape'] == common_shape]['date_time'].dt.year + years.value_counts().sort_index().plot(kind='line', label=common_shape) + + plt.legend() + plt.xlim(2000, 2018) + plt.title('UFO Sightings by Shape (2000-2018)') + ``` + ![](/assets/img/python-data-analysis/ufo-value-counts-by-shape.png) +- e.g. plot how blinding lights performed on the charts. note how we can specify the x and y attributes when plotting dataframes. also, note how we can invert the y axis - a rank is better when lower, and we want to show a higher rank using a peak / lower rank using a trench + ```py + billboard_charts = pd.read_csv('/content/drive/MyDrive/Python - Basic Data Analysis/billboard_charts.csv', parse_dates=['date']) + blinding_lights = billboard_charts[billboard_charts['song'] == 'Blinding Lights'] + + blinding_lights.plot(y='rank', x='date') + plt.gca().invert_yaxis() + plt.title('Blinding Lights Chart Performance') + ``` + ![](/assets/img/python-data-analysis/blinding_lights_chart_performance.png) +- when we try plotting a dataframe, the different columns would be plotted on the same axes by default + ```py + salaries.plot(kind='hist') + ``` + ![](/assets/img/python-data-analysis/dataframe-default-without-subplots.png) +- we can create subplots instead just by passing in keyword arguments + ```py + salaries.plot(kind='hist', subplots=True) + ``` + ![](/assets/img/python-data-analysis/dataframe-with-subplots.png) +- we can configure other parameters like **layout** (the **dimensions**), **sharex** / **sharey**, etc as well, already discussed in [matplotlib](#matplotlib) + ```py + salaries.plot(kind='hist', subplots=True, layout=(1, 3), figsize=(20, 5), sharex=True, bins=30) + plt.tight_layout() + ``` + ![](/assets/img/python-data-analysis/dataframe-with-subplots-configured.png) +- note, my understanding - the above method of passing in true for the **subplots** keyword argument works because we wanted to plot the different columns of the same dataframe. what if we wanted to plot entirely different series etc on the same **figure** on different **axes**. we use a combination of interacting with matplotlib apis directly and through pandas apis. apis used - + - **subplots** can be called for setting the dimensions of the subplot, setting figure size, etc. it returns both the **figure** and the **axes** created in the process. the axes we receive has the same rows / columns as the dimensions we specify. 
note that parameters like `sharex` / `sharey` can be passed into this subplots call as well + - note how we pass in **axes** argument to **plot** of pandas series / dataframe + + ```py + months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + + fig, axes = plt.subplots(2, 3, figsize=(15, 10)) + + data_2000 = ufos[ufos['date_time'].dt.year == 2000]['date_time'].dt.month.value_counts().sort_index() + data_2000.plot(kind='barh', ax=axes[0][0], ylabel='', title=2000) + axes[0][0].set_yticks(range(12), labels=months) + + data_2001 = ufos[ufos['date_time'].dt.year == 2001]['date_time'].dt.month.value_counts().sort_index() + data_2001.plot(kind='barh', ax=axes[0][1], ylabel='', title=2001) + axes[0][1].set_yticks(range(12), labels=months) + + data_2002 = ufos[ufos['date_time'].dt.year == 2002]['date_time'].dt.month.value_counts().sort_index() + data_2002.plot(kind='barh', ax=axes[0][2], ylabel='', title=2002) + axes[0][2].set_yticks(range(12), labels=months) + + data_2003 = ufos[ufos['date_time'].dt.year == 2003]['date_time'].dt.month.value_counts().sort_index() + data_2003.plot(kind='barh', ax=axes[1][0], ylabel='', title=2003) + axes[1][0].set_yticks(range(12), labels=months) + + data_2004 = ufos[ufos['date_time'].dt.year == 2004]['date_time'].dt.month.value_counts().sort_index() + data_2004.plot(kind='barh', ax=axes[1][1], ylabel='', title=2004) + axes[1][1].set_yticks(range(12), labels=months) + + data_2005 = ufos[ufos['date_time'].dt.year == 2005]['date_time'].dt.month.value_counts().sort_index() + data_2005.plot(kind='barh', ax=axes[1][2], ylabel='', title=2005) + axes[1][2].set_yticks(range(12), labels=months) + + plt.suptitle(f'UFO Sightings by Months (2000-2005)') + + plt.tight_layout() + ``` + ![](/assets/img/python-data-analysis/custom-subplots-using-pandas.png) +- for e.g. reproduce the graph below - + ![](/assets/img/python-data-analysis/christmas-songs-project.png) + - this time around, there is just one axes. 
so, we can call set xticks, set title, etc on this one axes itelf + - again, since this is ranks of songs, we invert the y axis + - the labels on x axes was another challenge here, but easy when using **xticks** + - pandas, matplotlib, etc are smart enough to understand dates even if we specify them like strings - note how we specify strings for dates when using **in between** and setting **xticks** + + ```py + years = [2016, 2017, 2018, 2019, 2020] + christmases = [f'{year}-12-25' for year in years] + # ['2016-12-25', '2017-12-25', '2018-12-25', '2019-12-25', '2020-12-25'] + + songs = [ + { 'song': 'All I Want For Christmas Is You', 'artist': 'Mariah Carey' }, + { 'song': 'Rockin\' Around The Christmas Tree', 'artist': 'Brenda Lee' }, + { 'song': 'Jingle Bell Rock', 'artist': 'Bobby Helms' } + ] + + period = billboard_charts['date'].between(christmases[0], christmases[-1]) + + _, axes = plt.subplots(1, 1, figsize=(10, 7)) + + plt.gca().invert_yaxis() + + years = [2016, 2017, 2018, 2019, 2020] + christmas_values = [pd.to_datetime(f'12-25-{year}') for year in years] + christmas_labels = [f'Xmas {year}' for year in years] + + axes.set_xticks(christmas_values, christmas_labels) + axes.set_title('Christmas Songs on the Hot') + + for song in songs: + condition = (billboard_charts['song'] == song['song']) & (billboard_charts['artist'] == song['artist']) + billboard_charts[condition & period].plot(kind='line', x='date', y='rank', ax=axes, label=song['song'], xlabel='') + + plt.legend(loc='upper left') + plt.tight_layout() + plt.show() + ``` +- for saving a figure to a local file, use `savefig(path.png)` + +## Grouping and Aggregation + +- assume i have data for stocks of different cars like below - + ```py + car_stocks + + # Symbol Date Open High Low Close Adj Close Volume + # 0 RIVN 2021-11-10 106.750000 119.459999 95.199997 100.730003 100.730003 103679500 + # 1 RIVN 2021-11-11 114.625000 125.000000 108.010002 122.989998 122.989998 83668200 + # 2 RIVN 2021-11-12 128.645004 135.199997 125.250000 129.949997 129.949997 50437500 + ``` +- to get the mean of a particular stock, i can do the following - + ```py + car_stocks[car_stocks['Symbol'] == 'RIVN']['Close'].mean() # 127.523 + ``` +- but what if i wanted the mean of all of the stocks individually in a dataframe? i can do it as follows + ```py + car_stocks.groupby('Symbol')['Close'].mean() + + # Symbol + # GM 62.164615 + # LCID 49.829231 + # RIVN 127.523077 + # Name: Close, dtype: float64 + ``` +- notice how **groupby** gives us a pandas **data frame group by object** + ```py + car_stocks.groupby('Symbol') + + # + ``` +- we can call **ngroups** to see the number of groups - + ```py + car_stocks.groupby('Symbol').ngroups # 3 + ``` +- we can call **groups** to see the actual groups. it is a dictionary, where the keys are the actual keys we used to group, while the values are the **indices** of the rows + ```py + car_stocks.groupby('Symbol').groups + + # {'GM': [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], + # 'LCID': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], + # 'RIVN': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]} + ``` +- iterating over the **dataframe group by object** using a for each group - we get back a tuple of the form group name, dataframe. ofcourse, the dataframe only contains rows belonging to the group. the columns used in the **group by** clause would not be present in the dataframe. 
use case - useful when the aggregation functions available to us by default are not enough, and we want to run some custom functionality + ```py + for name, group in car_stocks.groupby('Symbol'): + print(name) + print('--------------------') + print(group[['Low', 'High']].describe()) + print('\n') + + # GM + # -------------------- + # Low High + # mean 61.051539 63.129231 + # min 57.730000 60.560001 + # max 62.630001 65.180000 + # + # + # LCID + # -------------------- + # Low High + # mean 46.442539 51.811538 + # min 39.341000 45.000000 + # max 50.709999 57.750000 + # + # + # RIVN + # -------------------- + # Low High + # mean 119.150000 135.309230 + # min 95.199997 114.500000 + # max 153.779999 179.470001 + ``` +- when we tried calculating the mean of the closing price earlier - + - when we did `car_stocks.groupby('Symbol')`, we got back a **dataframe group by object** + - when we added a `car_stocks.groupby('Symbol')['Close']`, we got back a **series group by object** + - we finally called `car_stocks.groupby('Symbol')['Close'].mean()` to get back the mean of closing price for each symbol (i.e. stock) +- if we would have called mean on the **dataframe group by object** directly, we would have gotten back a **dataframe** - + ```py + car_stocks.groupby('Symbol').mean() + + # Open High Low + # Symbol + # GM 61.937693 63.129231 61.051539 + # LCID 48.761538 51.811538 46.442539 + # RIVN 127.710000 135.309230 119.150000 + ``` +- **split**, **apply**, **combine** - this is a workflow. going back to the closing price mean by stock example - + - we split into different parts - e.g. forming groups using groupby + - we then apply a function on each of the parts - e.g. performing a mean on each of these groups individually + - we finally combine the results from each of these parts - we get back a series containing means for each of the group +- we can also run multiple aggregation functions at once - below, we run it on both **dataframe group by object** and **series group by object**. 
running it on the dataframe group by object results in [hierarchical columns](#hierarchical-columns) - + ```py + car_stocks.groupby('Symbol')['Close'].agg(['mean', 'min', 'max']) + + # mean min max + # Symbol + # GM 62.164615 59.270000 64.610001 + # LCID 49.829231 40.750000 55.520000 + # RIVN 127.523077 100.730003 172.009995 + + car_stocks.groupby('Symbol').agg(['mean', 'min', 'max']) + # Open High Low + # mean min max mean min max mean min max + # Symbol + # GM 61.937693 57.849998 64.330002 63.129231 60.560001 65.180000 61.051539 57.730000 62.630001 + # LCID 48.761538 42.299999 56.200001 51.811538 45.000000 57.750000 46.442539 39.341000 50.709999 + # RIVN 127.710000 106.750000 163.800003 135.309230 114.500000 179.470001 119.150000 95.199997 153.779999 + ``` +- we can go more granular as well - we can run specific aggregation functions for specific columns as well - + ```py + car_stocks.groupby('Symbol').agg({ 'Open': ['min', 'max'], 'Close': ['mean'] }) + + # Open Close + # min max mean + # Symbol + # GM 57.849998 64.330002 62.164615 + # LCID 42.299999 56.200001 49.829231 + # RIVN 106.750000 163.800003 127.523077 + ``` +- we can provide custom functions to agg as well - understand that this could very well have been a function from a library, and we would just have to pass its reference - + ```py + def range(x): + return x.max() - x.min() + + car_stocks.groupby('Symbol')['Open'].agg(range) + + # Symbol + # GM 6.480004 + # LCID 13.900002 + # RIVN 57.050003 + ``` +- x is a **pandas series**, and range is called for every group - for all open prices for a particular stock, one at a time +- another example - this time, our custom aggregation function is called for multiple attributes, but everything is still the same. just that the output changes from a series to a dataframe, but the aggregation function is still called on a per attribute, per group basis + ```py + def count_nulls(x): + return len(x) - x.count() + + titanic.groupby('pclass').agg(count_nulls) + + # survived age sex + # pclass + # 1 0 39 0 + # 2 0 16 0 + # 3 0 208 0 + ``` +- **named aggregations** - we just saw nested columns above, when we try to do multiple aggregations on multiple columns at once. this can make accessing data more complicated, since we would have to use [hierarchical columns](#hierarchical-columns). in general, we might want to give a custom name to the result of our aggregation. 
we can do so using **named aggregations** - + ```py + car_stocks.groupby('Symbol').agg( + close_avg=('Close', 'mean'), + close_max=('Close', 'max'), + ) + + # close_avg close_max + # Symbol + # GM 62.164615 64.610001 + # LCID 49.829231 55.520000 + # RIVN 127.523077 172.009995 + ``` +- example - we have statisctics on a per player basis for laliga, having columns for team name, shots taken, shots on target +- we would like to generate the plot below - x axis of the plot is shared + ![](/assets/img/python-data-analysis/grouping-and-aggregations-project.png) +- generating the relevant data - + - first, we find total shots and shots on target by a team + - for this, we group by team, and perform sum aggregations for shots and shots on target + - we calculate accuracy using these two + - finally, we sort the data based on accuracy + + ```py + team_stats = laliga.groupby('Team').agg( + total=('Shots', 'sum'), + on_target=('Shots on target', 'sum') + ) + team_stats['accuracy'] = team_stats['on_target'] / team_stats['total'] + team_stats.sort_values(['accuracy'], inplace=True) + team_stats + + # total on_target accuracy + # Team + # SD Eibar 422 153 0.362559 + # D. Alavés 299 109 0.364548 + # CD Leganés 334 132 0.395210 + # R. Valladolid CF 319 131 0.410658 + # SD Huesca 343 142 0.413994 + ``` +- generating the plot - + - most accurate teams - top 5 rows, least accurate teams - bottom 5. use **head** and **tail** to obtain them + - we have entirely different pandas plots that we would like to plot on the same figure on different axes. so, we use **subplots**. subplots can apart from dimensions, receive the **sharex** parameter + - note how we pass the axes received from **subplots** to **plot** + - we can set the xticks on (any) axes i guess + + ```py + fig, axes = plt.subplots(2, 1, sharex=True) + + team_stats.tail(5).plot(kind='barh', y='accuracy', ax=axes[0], legend=False, title='Most Accurate Teams', color='green') + team_stats.head(5).plot(kind='barh', y='accuracy', ax=axes[1], legend=False, title='Least Accurate Teams', color='red') + + axes[0].set_xticks([0.1, 0.2, 0.3, 0.4, 0.5]) + + plt.tight_layout() + ``` +- we discuss [hierarchical indexing](#hierarchical-indexing) next, but we can group by levels of hierarchical indices as well. we need to specify the **levels** **keyword argument** for that + ```py + state_pops + + # population + # state year + # AK 1990 553290.0 + # 1991 570193.0 + # 1992 588736.0 + # ... ... ... + # WY 2009 559851.0 + # 2010 564222.0 + # 2011 567329.0 + + state_pops.groupby(level=['year']).sum() + + # year + # 1990 499245628.0 + # 1991 505961884.0 + # ... + # 2012 631398915.0 + # 2013 635872764.0 + ``` +- note - we specify name in this case, but we could have specified the **level** - 0, 1 etc as well +- if we see, the components of the **hierarchical index** are **named**, so specifying their names directly without the level keyword argument inside of groupby would have worked as well + ```py + state_pops.groupby('year').sum() + + # year + # 1990 499245628.0 + # 1991 505961884.0 + # ... + # 2012 631398915.0 + # 2013 635872764.0 + ``` +- summary - + - we saw grouping using attributes by now + - but then we might want to group by index / components of hierachical index as well + - hence we could use the **level** keyword argument + - but then, we could use the same syntax as attributes for indices as well i.e. 
omit the **level** keyword argument + +## Hierarchical Indexing + +- also called **multi indexing** +- when we group by a single column, we get the following result - + ```py + mean_by_sex = titanic.groupby('sex')['age'].mean() + + mean_by_sex.index + + # Index(['female', 'male'], dtype='object', name='sex') + + mean_by_sex + + # sex + # female 28.687071 + # male 30.585233 + ``` +- however, when we group by multiple columns, we get the following result - + ```py + mean_by_pclass_and_sex = titanic.groupby(['pclass', 'sex'])['age'].mean() + + mean_by_pclass_and_sex.index + + # MultiIndex([(1, 'female'), + # (1, 'male'), + # (2, 'female'), + # (2, 'male'), + # (3, 'female'), + # (3, 'male')], + # names=['pclass', 'sex']) + + mean_by_pclass_and_sex + + # pclass sex + # 1 female 37.037594 + # male 41.029250 + # 2 female 27.499191 + # male 30.815401 + # 3 female 22.185307 + # male 25.962273 + ``` +- so, labels instead of being a plain **index** are now **multi index** +- above, we showed a multi index with a **series**, below is an example of a **multi index** with a **dataframe**. the index in this case is the same as the one we got when doing a mean of age, only the entire data structure changes from a series to a dataframe + ```py + titanic.groupby(['pclass', 'sex']).mean(numeric_only=True) + + # survived age sibsp parch fare + # pclass sex + # 1 female 0.965278 37.037594 0.555556 0.472222 37.037594 + # male 0.340782 41.029250 0.340782 0.279330 41.029250 + # 2 female 0.886792 27.499191 0.500000 0.650943 27.499191 + # male 0.146199 30.815401 0.327485 0.192982 30.815401 + # 3 female 0.490741 22.185307 0.791667 0.731481 22.185307 + # male 0.152130 25.962273 0.470588 0.255578 25.962273 + ``` +- typically when seting up an index, we want it to - + - be unique - having the same index for multiple rows in a dataframe does not give an error. but, it is typically not advisable - e.g. [**loc**](#indexing) would give us multiple rows + - make our data easily accessible - use for e.g. semantic index / natural key +- imagine we have the following dataframe - + ```py + state_pops = pd.read_csv('data/state_pops.csv') + state_pops + + # state year population + # 0 AL 2012 4817528.0 + # 1 AL 2010 4785570.0 + # ... ... ... ... + # 1270 USA 2011 311582564.0 + # 1271 USA 2012 313873685.0 + ``` +- we can set up a [custom](#index) **hierarchical index** for this dataset + ```py + state_pops.set_index(['state', 'year'], inplace=True) + state_pops + + # population + # state year + # AL 2012 4817528.0 + # 2010 4785570.0 + # 2011 4801627.0 + # USA 2013 316128839.0 + # 2009 306771529.0 + # 2010 309326295.0 + ``` +- if we try [sorting the index](#index-and-sorting), by default, the data is sorted in the order of **levels** - e.g. the data is sorted first by state, and for a state, the rows are sorted by years + ```py + state_pops.sort_index() + + # population + # state year + # AK 1990 553290.0 + # 1991 570193.0 + # 1992 588736.0 + # WY 2009 559851.0 + # 2010 564222.0 + # 2011 567329.0 + ``` +- assume we want to sort the data by years only. so, all the data for the lowest year should come first and so on. we can do the below - + ```py + state_pops.sort_index(level=1) + + # population + # state year + # AK 1990 553290.0 + # AL 1990 4050055.0 + # ... ... ... + # WV 2013 1854304.0 + # WY 2013 582658.0 + ``` +- finally, assume we would like to sort in ascending order of state but then descending order of year. 
we can do the below - + ```py + state_pops.sort_index(level=[0, 1], ascending=[True, False]) + + # population + # state year + # AK 2013 735132.0 + # 2012 730307.0 + # ... ... ... + # WY 1994 480283.0 + # 1993 473081.0 + ``` +- finally - we were using numbers for levels till now, but names are supported as well - e.g. we can use `state_pops.sort_index(level=['year'], inplace=True)` +- **indexing** - behavior around slicing etc is pretty similar to what we studied [here](#indexing), just that we need to be wary of **levels** +- accessing by the first level only - we get back a **dataframe**, and not a **series** + ```py + state_pops.loc['WY'] + + # population + # year + # 1990 453690.0 + # 1991 459260.0 + # 1992 466251.0 + ``` +- accessing by all levels - we get back a series, where the indices are the columns. we need to provide a **tuple** with the values for all the levels. + ```py + state_pops.loc[('WY', 2013)] + + # population 582658.0 + ``` +- note - we can still use **slicing** etc when using tuples - + ```py + state_pops.loc[('WY', 2010) : ('WY', 2013)] + + # population + # state year + # WY 2010 564222.0 + # 2011 567329.0 + # 2012 576626.0 + # 2013 582658.0 + ``` +- till now, we saw accessing using the 1st level and all levels. what if we would like to access using some intermediate level(s)? +- first, recall from [updating](#updating-values), if we have a normal dataframe without the hierarchical indexing, we would use **loc** as follows (remember that `:` by itself means everything - all indices / all columns depending on where it is used) - + ```py + titanic + # pclass survived + # 0 1 1 + # 1 1 1 + # 2 1 0 + + titanic.loc[:, 'pclass'] + # 0 1 + # 1 1 + # 2 1 + + titanic.loc[:, ['pclass']] + # pclass + # 0 1 + # 1 1 + # 2 1 + + titanic.loc[:, :] + # pclass survived + # 0 1 1 + # 1 1 1 + # 2 1 0 + ``` +- so, extending on the above for a **dataframe** with **hierarchical indexing**, my understanding is we will need extra commas for the extra levels. so, back to our original question of how to access using selective levels when we have hierarchical indexing - we can for e.g. just use `:` for the levels for which we want everything, and specify singular values using `a`, specify ranges like `a:b`, specify selected values using `[a,b]` etc based on use case + ```py + state_pops.loc[:,:,] + # population + # state year + # AK 1990 553290.0 + # 1991 570193.0 + # ... ... ... + # WY 2009 559851.0 + # 2010 564222.0 + + # since we specify only one year + # pandas would eliminate this column altogether + state_pops.loc[:,2010,:] + # population + # state + # AK 713868.0 + # AL 4785570.0 + # AR 2922280.0 + # AZ 6408790.0 + # CA 37333601.0 + # CO 5048196.0 + + state_pops.loc[:,[2010,2013],:] + # population + # state year + # AK 2010 713868.0 + # 2013 735132.0 + # AL 2010 4785570.0 + # 2013 4833722.0 + # ... ... ... + # WV 2010 1854146.0 + # 2013 1854304.0 + # WY 2010 564222.0 + # 2013 582658.0 + + state_pops.loc[:,2010:2012,:] + # population + # state year + # AK 2010 713868.0 + # 2011 723375.0 + # 2012 730307.0 + # AL 2010 4785570.0 + # 2011 4801627.0 + # ... ... ... + # WV 2011 1855184.0 + # 2012 1856680.0 + # WY 2010 564222.0 + # 2011 567329.0 + # 2012 576626.0 + ``` +- **cross section** or `xs` is another useful alternative to the **loc** syntax when using **hierarhcical indexing**. 
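for reference, a minimal sketch of the same kind of lookup using **xs** (same state_pops dataframe as above) -
  ```py
  # every state's population for 2010 - the matched level is dropped from the resulting index
  state_pops.xs(2010, level='year')

  # pass drop_level=False to keep the matched level in the result instead
  state_pops.xs(2010, level='year', drop_level=False)
  ```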
i will stick to loc for now though + +### Accessing Hierachical Index Values + +- for accessing all values of a column, we use the syntax `df['col_name']`, but this would not work for index column(s) +- to access the [values of an index](#series-and-columns) when a dataframe does not have hierarchical indexing, we use `df.index` +- what if we wanted to access the components of a **hierarchical index**? assume our dataframe looks like this - + ```py + # population + # state year + # AK 1990 553290.0 + # 1991 570193.0 + # 1992 588736.0 + # ... ... ... + # WY 2009 559851.0 + # 2010 564222.0 + # 2011 567329.0 + ``` +- to access the index values of a particular position, we can use the following - + ```py + state_pops.index[0] # ('AK', 1990) + state_pops.index[1] # ('AK', 1991) + state_pops.index[2] # ('AK', 1992) + ``` +- to access all the index values, we have two options according too my understanding - +- option 1 - access via the **levels** property. but, it will only have the unique values - it would not be an accurate representation of our data + ```py + state_pops.index.levels + # FrozenList([['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'VT', 'WA', 'WI', 'WV', 'WY'], + # [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]] + + state_pops.index.levels[0] + # Index(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', + # 'VT', 'WA', 'WI', 'WV', 'WY'] + + state_pops.index.levels[1] + # Index([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, + # 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013], + ``` +- option 2 - accessing via **get level values**. usecase - recall how we performed [filtering](#filtering) using column attributes - `df[df['col'] > 500]`. we can do the same when using option 2. our conditions will look like this now - `df[df.index.get_level_values(1) > 500]` + ```py + state_pops.index.get_level_values(0) + # Index(['AK', 'AK', 'AK', 'AK', 'AK', 'AK', 'AK', 'AK', 'AK', 'AK', + # ... + # 'WY', 'WY', 'WY', 'WY', 'WY', 'WY', 'WY', 'WY', 'WY', 'WY'], + + state_pops.index.get_level_values(1)[:50] + # Index([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, + # 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, + # 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, + # 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, + # 1990, 1991], + ``` + +### Hierarchical Columns + +- we typically work with them when we use [groupings and aggregations](#grouping-and-aggregation) without for e.g. flattening them using **named aggregations**. 
assume we have created the following dataframe - + ```py + titanic_stats = titanic.groupby(['pclass', 'sex']).agg({ + 'fare': ['sum', 'mean'], + 'age': ['min', 'max', 'mean'] + }) + titanic_stats + + # fare age + # sum mean min max mean + # pclass sex + # 1 female 4926.0000 37.037594 2.0000 76.0 37.037594 + # male 6195.4167 41.029250 0.9167 80.0 41.029250 + # 2 female 2832.4167 27.499191 0.9167 60.0 27.499191 + # male 4868.8333 30.815401 0.6667 70.0 30.815401 + # 3 female 3372.1667 22.185307 0.1667 63.0 22.185307 + # male 9060.8333 25.962273 0.3333 74.0 25.962273 + ``` +- now, if we try inspecting the **columns** property of the **dataframe**, we see the below - + ```py + titanic_stats.columns + # MultiIndex([('fare', 'sum'), + # ('fare', 'mean'), + # ( 'age', 'min'), + # ( 'age', 'max'), + # ( 'age', 'mean')], + # ) + ``` +- to access the individual columns, we can access them using the two options below. recall that when we try accessing a column, we get back a series - the labels of this series is the same as the original dataframe (which in this case is a **hierarchical index**), while the values of the series are the values of the column. note - the second option is preferred / more efficient i think, because we access the desired data in one go - + ```py + titanic_stats['fare']['sum'] # option 1 + titanic_stats[('fare', 'sum')] # option 2 + + # pclass sex + # 1 female 4926.0000 + # male 6195.4167 + # 2 female 2832.4167 + # male 4868.8333 + # 3 female 3372.1667 + # male 9060.8333 + ``` + +### Unstack + +- helps pivot the index to columns. if we do not specify the level, the **largest** / **innermost** level is used +- assume we have the following **series** - + ```py + titanic_age_stats = titanic.groupby(['pclass', 'sex'])['age'].mean() + titanic_age_stats + + # pclass sex + # 1 female 37.037594 + # male 41.029250 + # 2 female 27.499191 + # male 30.815401 + # 3 female 22.185307 + # male 25.962273 + ``` +- when we try plotting it, we get the following. recollection of how ploting of pandas **series** works by default - x axis is the **index** (which is **hiearchical index** / **multi index** in this case), y axis is the values + ```py + titanic_age_stats.plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/plotting-hierarchical-index-without-unstack.png) +- when we unstack without any arguments, the below is what happens - the innermost level of sex becomes a column + ```py + titanic_age_stats.unstack() + + # sex female male + # pclass + # 1 37.037594 41.029250 + # 2 27.499191 30.815401 + # 3 22.185307 25.962273 + ``` +- now when we try plotting this, we get the below. recollection of how plotting for a dataframe works - we get a bar for every attribute for every index. the values of these attributes is the y axis, the labels are the x axis + ```py + titanic_age_stats.unstack().plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/plotting-hierarchical-index-with-unstack.png) +- we can also specify the **level** we would like to unstack using - + ```py + titanic_age_stats.unstack(level='pclass') + + # pclass 1 2 3 + # sex + # female 37.037594 27.499191 22.185307 + # male 41.029250 30.815401 25.962273 + + titanic_age_stats.unstack(level='pclass').plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/plotting-hierarchical-index-with-custom-unstack.png) +- note, my understanding - we have till now performed **unstack** on a **series** with **hierarchical index**. 
this results in a **dataframe**, where the column is the level that we unstack, and a **level** from the **hierarhcical index** is removed +- complicating things because i am bored - when we try unstacking a **dataframe** with **hierarchical columns** - we get an additional level of **hierarchical columns** + ```py + titanic_age_stats = titanic.groupby(['pclass', 'sex']).agg({ + 'age': ['min', 'max', 'mean'] + }) + titanic_age_stats + # age + # min max mean + # pclass sex + # 1 female 2.0000 76.0 37.037594 + # male 0.9167 80.0 41.029250 + # 2 female 0.9167 60.0 27.499191 + # male 0.6667 70.0 30.815401 + # 3 female 0.1667 63.0 22.185307 + # male 0.3333 74.0 25.962273 + + titanic_age_stats.unstack() + # age + # min max mean + # sex female male female male female male + # pclass + # 1 2.0000 0.9167 76.0 80.0 37.037594 41.029250 + # 2 0.9167 0.6667 60.0 70.0 27.499191 30.815401 + # 3 0.1667 0.3333 63.0 74.0 22.185307 25.962273 + ``` + +## Textual Data + +- by default, pandas assigns type object to columns if they cannot be assigned numeric data types. object data type encompasses strings, numbers, arrays, etc everything +- my understanding - even if a column is of type object, we can access string methods on it. the other option i believe is to convert it to string type first using [**astype**](#data-types) +- we can access string methods using **str** + ```py + titanic['name'].str.lower() + + # 0 allen, miss. elisabeth walton + # 1 allison, master. hudson trevor + # ... + # 1307 zakarian, mr. ortin + # 1308 zimmerman, mr. leo + ``` +- understand that we just used lower on the column, but pandas was smart enough to apply it to the entire series. this is also applicable to **string indexing**. e.g. the cabin column looks like below - it is a combination of deck and cabin number, and we make a new column just for deck as follows + ```py + titanic['cabin'] + + # 0 B5 + # 1 C22 C26 + # 2 C22 C26 + # 3 C22 C26 + + titanic['deck'] = titanic['cabin'].str[0] + titanic['deck'] + # 0 B + # 1 C + # 2 C + # 3 C + ``` +- we can use slicing etc as well +- **strip** - strips whitespaces by default + ```py + s = pd.Series(['1. Hawk. ', '2. Pickle!\n', '3. Melonhead?\t']) + s + + # 0 1. Hawk. + # 1 2. Pickle!\n + # 2 3. Melonhead?\t + + s.str.strip() + + # 0 1. Hawk. + # 1 2. Pickle! + # 2 3. Melonhead? + ``` +- note - more features of the **strip** api - + - specify the characters to strip using the **to_strip** parameter + - it also has different versions - **lstrip** and **rstrip** to only strip from beginning / end +- **split** - split strings into components. by default, the output would be a list for every string + ```py + titanic['home.dest'].str.split('/') + + # 0 [St Louis, MO] + # 1 [Montreal, PQ , Chesterville, ON] + # 2 [Montreal, PQ , Chesterville, ON] + # 3 [Montreal, PQ , Chesterville, ON] + # 4 [Montreal, PQ , Chesterville, ON] + ``` +- we can make each element its own **series** / **column** by setting the **expand** option to true + ```py + titanic['home.dest'].str.split('/', expand=True) + + # 0 1 2 + # 0 St Louis, MO None None + # 1 Montreal, PQ Chesterville, ON None + # 2 Montreal, PQ Chesterville, ON None + ``` +- note - more features of the **split** api - + - a regex instead of a normal sring to split based on + - we can specify the maximum limit i.e. the maximum number of columns the split should go upto. 
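for instance, a minimal sketch with the same home.dest column from above (the limit goes in via the `n` keyword argument) -
    ```py
    # at most one split, so at most two columns come back, no matter how many '/' a value contains
    titanic['home.dest'].str.split('/', n=1, expand=True)
    ```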
no more splits would be created, and everything would be put into the last column +- **replace** - we have already seen [replace](#modifying-columns-and-indices), but this is the replace method available for string data type + ```py + ufos['duration'] + # 0 5 seconds + # 1 3-5 seconds + # 2 NaN + # 3 10 seconds + + ufos['duration'].str.replace('seconds', 's') + # 0 5 s + # 1 3-5 s + # 2 NaN + # 3 10 s + ``` +- above was a simple use case, but we can get very complicated with **replace** - refer [docs](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html) - we can match using **regex**, and instead of passing in what to replace with, we can pass a **callable** which would be called using the for e.g. regex that was matched +- **contains** - returns a boolean +- again instead of a plain string, we can pass in a regex to match as well +- a complex example - imagine the movies in our dataset have a "genres" column, which are separated by pipes. we can find the genre value counts as follows using **explode** - + ```py + movies['genres'] + + # 0 Animation|Comedy|Family + # 1 Adventure|Fantasy|Family + # 2 Romance|Comedy + # 3 Comedy|Drama|Romance + + movies['genres'].str.split('|').explode().value_counts() + + # Drama 20054 + # Comedy 13067 + # Thriller 7565 + # Romance 6662 + # Action 6542 + ``` + +## Apply and Map + +- **apply** - run on every value of the **series** + ```py + titanic['age'] + # 0 29.0000 + # 1 0.9167 + # 2 2.0000 + # 3 30.0000 + # 4 25.0000 + + titanic['age'].apply(lambda x: (x, x * 365)) + # 0 (29.0, 10585.0) + # 1 (0.9167, 334.59549999999996) + # 2 (2.0, 730.0) + # 3 (30.0, 10950.0) + # 4 (25.0, 9125.0) + ``` +- in case our function requires arguments, we can pass them as so - + ```py + titanic['fare'] + + # 0 211.3375 + # 1 151.5500 + # 2 151.5500 + + def currency_conveter(amount, denomination, multiplier): + return f'{denomination}{amount * multiplier}' + + titanic['fare'].apply(currency_conveter, args=('$', 23)) + + # 0 $4860.7625 + # 1 $3485.65 + # 2 $3485.65 + ``` +- till now, we saw **apply** for **series**. when using **apply** on a **dataframe**, it will call the function for all **columns** by default. so, if we return back one value per column, we get back a **series**, where the labels are column names + ```py + titanic[['age', 'fare', 'pclass']].apply(lambda col: col.max() - col.min()) + + # age 79.8333 + # fare 512.3292 + # pclass 2.0000 + ``` +- we can change it to be called for all **rows** instead. usecase - we have a complex calculation that involves multiple columns of the row. e.g. we have two columns, representing (number of siblings and spouses) and (number of parents and children) respectively. we can get the family size by adding the two. we need to pass in the **axis** argument, which is **index** by default + ```py + titanic['relatives'] = titanic.apply(lambda row: row['sibsp'] + row['parch'], axis='columns') + ``` +- note - doing `titanic['relatives'] = titanic['sibsp'] + titanic['parch']` would also have worked in this case +- **map** (for **series**) - we pass it a dictionary, and it will replace any values matching the key of the dictionary with the value for that key + ```py + titanic['pclass'] + + # 0 1 + # 1 1 + # ... + # 1307 3 + # 1308 3 + + titanic['pclass'].map({ 1: '1st', 2: '2nd', 3: '3rd' }) + + # 0 1st + # 1 1st + # ... 
+ # 1307 3rd + # 1308 3rd + ``` +- we can also pass a function to **map**, and **map** and **apply** will work in the same way in this case +- when we use **map** on **dataframes**, the function is run on all cells of the **dataframe**. recall how **apply** was only run along one of the axis - so, the function was either passed the entire row or the entire column + ```py + titanic[['name', 'home.dest']] + + # name home.dest + # 0 Allen, Miss. Elisabeth Walton St Louis, MO + # 1 Allison, Master. Hudson Trevor Montreal, PQ / Chesterville, ON + # 2 Allison, Miss. Helen Loraine Montreal, PQ / Chesterville, ON + + titanic[['name', 'home.dest']].map(lambda str: str.capitalize()) + + # name home.dest + # 0 Allen, miss. elisabeth walton St louis, mo + # 1 Allison, master. hudson trevor Montreal, pq / chesterville, on + # 2 Allison, miss. helen loraine Montreal, pq / chesterville, on + ``` + +## Combining Dataframes + +### Concat + +- **concat** - concatenate series / dataframes + ```py + import pandas as pd + + s1 = pd.Series(['a', 'b', 'c']) + s2 = pd.Series(['d', 'e', 'f']) + + pd.concat([s1, s2]) + # 0 a + # 1 b + # 2 c + # 0 d + # 1 e + # 2 f + ``` +- we can set **ignore index** to true if our index was not semantic. notice the difference in the index values above and below + ```py + pd.concat([s1, s2], ignore_index=True) + # 0 a + # 1 b + # 2 c + # 3 d + # 4 e + # 5 f + ``` +- we can concatenate **by index** follows - + ```py + pd.concat([s1, s2], axis='columns') + # 0 1 + # 0 a d + # 1 b e + # 2 c f + ``` +- however, this is not just putting side by side - it is actually using the index values to join. e.g. - + ```py + food = pd.Series( + data=['avocado', 'blueberry', 'cucumber'], + index=['a', 'b', 'c'] + ) + + animals = pd.Series( + data=['dolphin', 'bear', 'chameleon'], + index=['d', 'b', 'c'] + ) + + pd.concat([food, animals], axis='columns') + + # 0 1 + # a avocado NaN + # b blueberry bear + # c cucumber chameleon + # d NaN dolphin + ``` +- notice the column names would be numeric by default. we can change that using the **keys** keyword argument + ```py + pd.concat([food, animals], axis='columns', keys=['khana', 'janwar']) + + # khana janwar + # a avocado NaN + # b blueberry bear + # c cucumber chameleon + # d NaN dolphin + ``` +- note - we saw NaN earlier, because the join is **outer** by default. we can set it to **inner** as well + ```py + pd.concat([food, animals], axis='columns', join='inner') + + # 0 1 + # b blueberry bear + # c cucumber chameleon + ``` +- till now, we were combining **series**. now, we combine **dataframes**. assume we have the data below - + ```py + harvest_21 = pd.DataFrame( + [['potatoes', 9001], ['garlic', 1350], ['onions', 87511]], + columns=['crop', 'qty'] + ) + # crop qty + # 0 potatoes 9001 + # 1 garlic 1350 + # 2 onions 87511 + + harvest_22 = pd.DataFrame( + [[1600, 'garlic'], [560, 'spinach'], [999, 'turnips'], [1000, 'onions']], + columns=['qty', 'crop'] + ) + # qty crop + # 0 1600 garlic + # 1 560 spinach + # 2 999 turnips + # 3 1000 onions + ``` +- when we try to concatenate the two dataframes, we get the below. 
note - even though the ordering of columns for the two dataframes were different, pandas combines them using the column names + ```py + pd.concat([harvest_21, harvest_22]) + # crop qty + # 0 potatoes 9001 + # 1 garlic 1350 + # 2 onions 87511 + # 0 garlic 1600 + # 1 spinach 560 + # 2 turnips 999 + # 3 onions 1000 + ``` +- assume we have another dataframe with an extra column - + ```py + harvest_23 = pd.DataFrame( + [['potatoes', 900, 500], ['garlic', 1350, 1200], ['onions', 875, 950]], + columns=['crop', 'qty', 'profit'] + ) + # crop qty profit + # 0 potatoes 900 500 + # 1 garlic 1350 1200 + # 2 onions 875 950 + ``` +- if we now try concatenating two dataframes with difference in columns, we get NaN for the missing columns + ```py + pd.concat([harvest_22, harvest_23]) + # qty crop profit + # 0 1600 garlic NaN + # 1 560 spinach NaN + # 2 999 turnips NaN + # 3 1000 onions NaN + # 0 900 potatoes 500.0 + # 1 1350 garlic 1200.0 + # 2 875 onions 950.0 + ``` +- to change this behavior, we can specify **inner** for the join type + ```py + pd.concat([harvest_22, harvest_23], join='inner') + # qty crop + # 0 1600 garlic + # 1 560 spinach + # 2 999 turnips + # 3 1000 onions + # 0 900 potatoes + # 1 1350 garlic + # 2 875 onions + ``` +- the **ignore index** parameter behaves in the same way, already discussed +- we can also set up **hierarchical indexing** using the **keys** parameter - e.g. it is typical to analyze files for different years simultaneously, and we might want to encode this information in the form of a hierarchical index for the dataframe + ```py + pd.concat([harvest_21, harvest_22, harvest_23], join='inner', keys=[2021, 2022, 2023]) + # crop qty + # 2021 0 potatoes 9001 + # 1 garlic 1350 + # 2 onions 87511 + # 2022 0 garlic 1600 + # 1 spinach 560 + # 2 turnips 999 + # 3 onions 1000 + # 2023 0 potatoes 900 + # 1 garlic 1350 + # 2 onions 875 + ``` + +### Merge + +- its closer to a database style join and is more flexible than [**concat**](#concat) since we can combine using columns instead of relying on the index + ```py + teams = pd.DataFrame( + [ + ["Suns", "Phoenix", 20, 4], + ["Mavericks", "Dallas", 11, 12], + ["Rockets", "Houston", 7, 16], + ["Nuggets", "Denver", 11, 12] + ], + columns=["team", "city", "wins", "losses"] + ) + # team city wins losses + # 0 Suns Phoenix 20 4 + # 1 Mavericks Dallas 11 12 + # 2 Rockets Houston 7 16 + # 3 Nuggets Denver 11 12 + + cities = pd.DataFrame( + [ + ["Houston", "Texas", 2310000], + ["Phoenix", "Arizona", 1630000], + ["San Diego", "California", 1410000], + ["Dallas", "Texas", 1310000] + ], + columns=["city", "state", "population"] + ) + # city state population + # 0 Houston Texas 2310000 + # 1 Phoenix Arizona 1630000 + # 2 San Diego California 1410000 + # 3 Dallas Texas 1310000 + ``` +- now, if we perform a merge, an inner join is performed using the common column name automatically - + ```py + teams.merge(cities) + # team city wins losses state population + # 0 Suns Phoenix 20 4 Arizona 1630000 + # 1 Mavericks Dallas 11 12 Texas 1310000 + # 2 Rockets Houston 7 16 Texas 2310000 + ``` +- we can set the **how** parameter for join type. 
as we saw, it is **inner** by default, but we can set it to **outer**, **left**, **right**, etc + ```py + teams.merge(cities, how='left') + + # team city wins losses state population + # 0 Suns Phoenix 20 4 Arizona 1630000.0 + # 1 Mavericks Dallas 11 12 Texas 1310000.0 + # 2 Rockets Houston 7 16 Texas 2310000.0 + # 3 Nuggets Denver 11 12 NaN NaN + ``` +- cross join is also there - all rows of one dataframe with all rows of the other dataframe +- by default, the same column name was used explicitly. we can however, specify the column(s) explicitly using the **on** keyword argument + ```py + teams.merge(cities, on='city') + ``` +- note - we can specify multiple columns for the on parameter as well based on use case +- what if the two dataframes have similar column names, and are not being used for joining? pandas will suffix them with _x and _y by default. e.g. below, the name column is being used for the join, so it is only present once. however, the score column is not, and therefore it is preset with a suffix + ```py + midterm = pd.DataFrame( + [['shameek', 42], ['colt', 45]], + columns=['name', 'score'] + ) + + final = pd.DataFrame( + [['shameek', 85], ['colt', 97]], + columns=['name', 'score'] + ) + + midterm.merge(final, on='name') + + # name score_x score_y + # 0 shameek 42 85 + # 1 colt 45 97 + ``` +- we can however, specify the **suffixes** to append - + ```py + midterm.merge(final, on='name', suffixes=['_midterm', '_final']) + + # name score_midterm score_final + # 0 shameek 42 85 + # 1 colt 45 97 + ``` +- also note how we had to specify **on** explicitly, otherwise both name and score would be used. since there is no data with the same value in both tables, we end up with an empty result set + +## Seaborn + +### Relational Plots + +- uses [matplotlib](#matplotlib) underneath, and works well with pandas +- typically imported as sns + ```py + import seaborn as sns + ``` +- to play around with seaborn, we can use any of the datasets present [here](https://github.com/mwaskom/seaborn-data) via **load dataset**. 
it returns the pandas dataframe + ```py + tips = sns.load_dataset('tips') + tips + + # total_bill tip sex smoker day time size + # 0 16.99 1.01 Female No Sun Dinner 2 + # 1 10.34 1.66 Male No Sun Dinner 3 + # 2 21.01 3.50 Male No Sun Dinner 3 + ``` +- note - for the default theme of sns to kick in which kind of looks good, run the following + ```py + sns.set_theme() + ``` +- for a scatterplot, we can do the following - + ```py + sns.scatterplot(tips, x='total_bill', y='tip') + ``` + ![](/assets/img/python-data-analysis/seaborn-getting-started.png) +- note - the exact above result could have been achieved without seaborn as well - + ```py + tips.plot(kind='scatter', x='total_bill', y='tip') + ``` +- but, now, look how we can simply pass **hue** for different scatter plots based on color on the same axes - + ```py + sns.scatterplot(tips, x='total_bill', y='tip', hue='sex') + ``` + ![](/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue.png) +- further, we can pass in **style** for different scatter plots based on marker on the same axes + ```py + sns.scatterplot(tips, x='total_bill', y='tip', hue='sex', style='smoker') + ``` + ![](/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue-and-style.png) +- note - if we use the same column for **hue** and **style**, the marker and color both change, thus maybe improving readability + ```py + sns.scatterplot(tips, x='total_bill', y='tip', hue='sex', style='sex') + ``` + ![](/assets/img/python-data-analysis/seaborn-scatter-plot-with-same-column-for-hue-and-style.png) +- e.g. assume tips have a size column, which represents the number of people together. we can add the **size** keyword argument, which changes the size of the marker + ```py + sns.scatterplot(tips, x='total_bill', y='tip', size='size') + ``` + ![](/assets/img/python-data-analysis/seaborn-scatter-plot-with-size.png) +- assume we have a dataset for flights like so i.e. we have 12 records per year for each of the months - + ```py + flights = sns.load_dataset('flights') + flights + + # year month passengers + # 0 1949 Jan 112 + # 1 1949 Feb 118 + # 2 1949 Mar 132 + ``` +- e.g. we try to create a lineplot below. but, we do not specify how it should plot the multiple records that it gets for a passenger in a year. it plots using the **estimator** as **mean** by default + ```py + sns.lineplot(flights, x='year', y='passengers') + ``` + ![](/assets/img/python-data-analysis/seaborn-line-plot-default.png) +- if we wanted to achieve this ourselves using matplotlib, we would have to group it and then use the aggregation function like below - + ```py + flights.groupby('year')['passengers'].mean().plot() + ``` + ![](/assets/img/python-data-analysis/seaborn-line-plot-matplotlib-equivalent.png) +- estimators are pandas functions. we can also provide a custom estimator, e.g. `sum` as so - + ```py + sns.lineplot(flights, x='year', y='passengers', estimator='sum') + ``` + ![](/assets/img/python-data-analysis/seaborn-line-plot-default-with-estimator.png) +- note how there is also a **confidence interval** that seaborn also adds to the plot. we can control its width, method, etc using **error bar**. setting it to None would remove it completely + ```py + sns.lineplot(flights, x='year', y='passengers', estimator='sum', errorbar=None) + ``` +- my understanding - seaborn has two kinds of plots - **figure level plots** and **axes level plots**. 
the ones we saw above - **lineplot** and **scatterplot** are **axes level plots** their corresponding **figure level plot** is **relplot** or **relational plot** + ```py + # initial + sns.scatterplot(data=tips, x='total_bill', y='tip') + + # using relplot + sns.relplot(data=tips, x='total_bill', y='tip', kind='scatter') + ``` +- but now, we can easily put different subplots / different axes on the same figure +- e.g. assume we would like to have different columns for the different values of sex + ```py + sns.relplot(data=tips, x='total_bill', y='tip', row='time', col='sex', hue='smoker') + ``` + ![](/assets/img/python-data-analysis/seaborn-relplot-introduction.png) +- a more involved example. break into - + - columns using sex + - rows using time - lunch or dinner + - different colors for smokers and non smokers + + ```py + sns.relplot(data=tips, x='total_bill', y='tip', row='time', col='sex', hue='smoker') + ``` + ![](/assets/img/python-data-analysis/seaborn-relplot-involved-example.png) +- controlling figure size for **axes level plots** - we make the figure call first + ```py + plt.figure(figsize=(4, 3)) + sns.scatterplot(data=tips, x='total_bill', y='tip') + ``` +- controlling figure size for **figure level plots** - relplot creates a figure for us bts, so we cannot call the figure ourselves. instead, we control size of each **facet** i.e. subplot using **height** and **aspect** (ratio between height and width) + ```py + sns.relplot(data=tips, x='total_bill', y='tip', row='time', col='sex', hue='smoker', height=3, aspect=2) + ``` + +### Distribution Plots + +- [**relation plots**](#relational-plots) - relation between two things x and y +- **distribution plots** - distribution of data, e.g. histogram +- histogram example - assume we try to visualize the tips dataset - + ```py + sns.histplot(data=tips, x='tip') + ``` + ![](/assets/img/python-data-analysis/seaborn-histogram-introduction.png) +- if we use **hue**, by default, they would come one on top of another. the opacity is such that they are see through - + ```py + sns.histplot(data=tips, x='tip', hue='smoker') + ``` + ![](/assets/img/python-data-analysis/seaborn-histogram-with-hue.png) +- we can configure it to be **stacked** instead of appearing one on top of another + ```py + sns.histplot(data=tips, x='tip', hue='smoker', multiple='stack') + ``` + ![](/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple.png) +- we can also set multiple to be **dodge**, so that appear one beside another. note how i also configure **bins** in this case + ```py + sns.histplot(data=tips, x='tip', hue='smoker', multiple='dodge', bins=5) + ``` + ![](/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple-dodge.png) +- finally, we can add the **kde curve** to the histogram plot as well by setting kde to true + ```py + sns.histplot(data=tips, x='tip', hue='smoker', kde=True) + ``` + ![](/assets/img/python-data-analysis/seaborn-histogram-with-kde.png) +- above, we ovrlayed the **kde curve** on top of the histogram. however, we can add a standalone **kde curve** as well. 
below, we try to visualize the weights of different species of penguins simultaneously + ```py + sns.kdeplot(data=penguins, x='body_mass_g', hue='species') + ``` + ![](/assets/img/python-data-analysis/seaborn-kde-introduction.png) +- finally, we can also configure the precision by **adjusting the bandwidth** + ```py + sns.kdeplot(data=penguins, x='body_mass_g', hue='species', bw_adjust=0.4) + ``` + ![](/assets/img/python-data-analysis/seaborn-kde-with-bandwidth-adjustment.png) +- **histograms** / **kde plots** are also called as **univariate distribution plots** i.e. we only look at the distribution of a single feature +- we can look at **bivariate distribution plots** as well i.e. analyze two features at once, both on x and y axis +- **kde bivariate distribution plots** - try looking for smoother curves (like the hollow i believe?) + ```py + sns.kdeplot(data=penguins, x='bill_length_mm', y='flipper_length_mm', hue='species') + ``` + ![](/assets/img/python-data-analysis/seaborn-bivariate-kde-plot.png) +- **histogram bivariate distribution plots** - try looking for the concentrated coloring (like a heat map) + ```py + sns.histplot(data=penguins, x='bill_length_mm', y='flipper_length_mm', hue='species') + ``` + ![](/assets/img/python-data-analysis/seaborn-bivariate-histogram-plot.png) +- **rugplots** - ticks along the x or y axis to show the presence of an observation + ```py + sns.rugplot(data=penguins, x='body_mass_g') + ``` + ![](/assets/img/python-data-analysis/seaborn-rugplot-basics.png) +- this is not very useful by itself. because rugplots are useful when used with other plots. e.g. below, from our scatterplot, it is difficult to find out where the majority of the values lie, so we supplement it with a rugplot + ```py + sns.scatterplot(data=diamonds, x='carat', y='price', s=2) + sns.rugplot(data=diamonds, x='carat', y='price', alpha=0.005) + ``` + ![](/assets/img/python-data-analysis/seaborn-rugplot-supplementing-scatterplot.png) +- we use **displot** for the **figure level plot** of distibution plots, no surprises here + ```py + sns.displot(data=penguins, kind='kde', x='body_mass_g', col='species', row='island', height=2, aspect=2, hue='sex') + ``` + ![](/assets/img/python-data-analysis/seaborn-displot-example.png) + +### Categorical Plots + +- **count plot** - displays count. but unlike **histograms** which typically used for numerical data, **count plots** are typically used for non numerical data + ```py + sns.countplot(data=penguins, x='species', hue='sex') + ``` + ![](/assets/img/python-data-analysis/seaborn-countplot-introduction.png) +- to achieve something similar when using matplotlib by itself, i did the following - + ```py + penguins[['species', 'sex']].value_counts().unstack('sex').plot(kind='bar') + ``` + ![](/assets/img/python-data-analysis/seaborn-countplot-simulation-using-matplotlib.png) +- issue - if we tried to make a **scatterplot** for categorical data - it would be hard to comment on the density - + ```py + sns.scatterplot(data=titanic, x='pclass', y='age') + ``` + ![](/assets/img/python-data-analysis/seaborn-scatterplot-for-categorical-data.png) +- solution 1 - we can use **stripplot** - it introduces a little bit of **jitter** to improve readability - + ```py + sns.stripplot(data=titanic, x='pclass', y='age') + ``` + ![](/assets/img/python-data-analysis/seaborn-stripplot-example.png) +- solution 2 - we can use **swarmplot** - it ensures points are **non overlapping** to improve readability. 
my understanding - use this only for smaller / sampled datasets, otherwise achieving this can become difficult + ```py + plt.figure(figsize=(10, 4)) + sns.swarmplot(data=titanic, x='pclass', y='age') + ``` + ![](/assets/img/python-data-analysis/seaborn-swarmplot-example.png) +- note how i had to adjust the figuresize, otherwise i get the warning - `UserWarning: 15.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.` +- **box plots** - helps visualize distribution of categorical data easily. features - + - **q1** represents the 25% value + - **q3** represents the 75% value + - we have the **median** value plotted in between + - the range between q1 to q3 is called **iqr** or **inter quartile range** + - the lines surrounding iqr are called **whiskers**. they are placed relative to q1 and q3, and default to 1.5 i believe + - finally, we have **outliers** outside these whiskers + + ```py + sns.boxplot(data=titanic, x='age') + ``` + ![](/assets/img/python-data-analysis/seaborn-boxplot.png) +- using boxplot for categorical data - + ```py + sns.boxplot(data=titanic, x='pclass', y='age', hue='sex') + ``` + ![](/assets/img/python-data-analysis/seaborn-boxplot-for-catgeories.png) +- combining boxplot and swarmplot. small reminder from [matplotlib](#matplotlib) that they go into the same figure and axes since we do not call a `plt.figure()` in between + ```py + sns.boxplot(data=penguins, y='body_mass_g', x='species') + sns.swarmplot(data=penguins, y='body_mass_g', x='species', color='black') + ``` + ![](/assets/img/python-data-analysis/seaborn-boxplot-and-swarmplot.png) +- **violin plot** - has the **box plot** at the center along with the **kde curve**. carefully look at the black line to see the median, inter quartile range and whiskers + ```py + sns.violinplot(data=titanic, x='pclass', y='age') + ``` + ![](/assets/img/python-data-analysis/seaborn-violinplot-introduction.png) +- note - if we add a hue, it creates different violin plots side by side + ```py + sns.violinplot(data=titanic, x='pclass', y='age', hue='sex') + ``` + ![](/assets/img/python-data-analysis/seaborn-violinplot-with-hue-without-split.png) +- we can however, change this behavior by providing the **split** parameter + ```py + sns.violinplot(data=titanic, x='pclass', y='age', hue='sex', split=True) + ``` + ![](/assets/img/python-data-analysis/seaborn-violinplot-with-hue-and-split.png) +- **bar plot** - again, compare the difference from [matplotib](#matplotlib), where there is no calculation - it just plots, while seaborn grouping and using an **estimator** like we saw in [**line plots**](#relational-plots) + ```py + sns.barplot(data=titanic, y='pclass', x='survived', hue='sex', estimator='sum', orient='h') + ``` + ![](/assets/img/python-data-analysis/seaborn-barplot.png) +- the black line i believe helps with approximation and thus faster plotting and calculations +- plotting the same thing using matplotlib - + ```py + titanic.groupby(['pclass', 'sex'])['survived'].sum().unstack().plot(kind='barh') + ``` + ![](/assets/img/python-data-analysis/seaborn-barplot-simulation-using-matplotlib.png) +- **categorical plot** - figure level plot, not bothering as there is nothing new diff --git a/_sass/addon/commons.scss b/_sass/addon/commons.scss new file mode 100644 index 0000000..86b4899 --- /dev/null +++ b/_sass/addon/commons.scss @@ -0,0 +1,1576 @@ +/* + The common styles +*/ + +html { + @media (prefers-color-scheme: light) { + &:not([data-mode]), + &[data-mode='light'] { + @include 
light-scheme; + } + + &[data-mode='dark'] { + @include dark-scheme; + } + } + + @media (prefers-color-scheme: dark) { + &:not([data-mode]), + &[data-mode='dark'] { + @include dark-scheme; + } + + &[data-mode='light'] { + @include light-scheme; + } + } + + font-size: 16px; +} + +body { + background: var(--main-bg); + padding: env(safe-area-inset-top) env(safe-area-inset-right) + env(safe-area-inset-bottom) env(safe-area-inset-left); + color: var(--text-color); + -webkit-font-smoothing: antialiased; + font-family: $font-family-base; +} + +/* --- Typography --- */ + +@for $i from 1 through 5 { + h#{$i} { + @extend %heading; + + @if $i > 1 { + @extend %section; + @extend %anchor; + } + + @if $i < 5 { + $factor: 0.18rem; + + @if $i == 1 { + $factor: 0.23rem; + } + + font-size: 1rem + (5 - $i) * $factor; + } @else { + font-size: 1rem; + } + } +} + +a { + @extend %link-color; + + text-decoration: none; +} + +img { + max-width: 100%; + height: auto; + transition: all 0.35s ease-in-out; + + &[data-src] { + &[data-lqip='true'] { + &.lazyload, + &.lazyloading { + -webkit-filter: blur(20px); + filter: blur(20px); + } + } + + &:not([data-lqip='true']) { + &.lazyload, + &.lazyloading { + background: var(--img-bg); + } + + &.lazyloaded { + -webkit-animation: fade-in 0.35s ease-in; + animation: fade-in 0.35s ease-in; + } + } + + &.shadow { + -webkit-filter: drop-shadow(2px 4px 6px rgba(0, 0, 0, 0.08)); + filter: drop-shadow(2px 4px 6px rgba(0, 0, 0, 0.08)); + box-shadow: none !important; /* cover the Bootstrap 4.6.1 styles */ + } + + @extend %img-caption; + } + + @-webkit-keyframes fade-in { + from { + opacity: 0; + } + to { + opacity: 1; + } + } + + @keyframes fade-in { + from { + opacity: 0; + } + to { + opacity: 1; + } + } +} + +blockquote { + border-left: 5px solid var(--blockquote-border-color); + padding-left: 1rem; + color: var(--blockquote-text-color); + + &[class^='prompt-'] { + border-left: 0; + position: relative; + padding: 1rem 1rem 1rem 3rem; + color: var(--prompt-text-color); + + @extend %rounded; + + &::before { + text-align: center; + width: 3rem; + position: absolute; + left: 0.25rem; + margin-top: 0.4rem; + text-rendering: auto; + -webkit-font-smoothing: antialiased; + } + + > p:last-child { + margin-bottom: 0; + } + } + + @include prompt('tip', '\f0eb', 'regular'); + @include prompt('info', '\f06a'); + @include prompt('warning', '\f06a'); + @include prompt('danger', '\f071'); +} + +kbd { + font-family: inherit; + display: inline-block; + vertical-align: middle; + line-height: 1.3rem; + min-width: 1.75rem; + text-align: center; + margin: 0 0.3rem; + padding-top: 0.1rem; + color: var(--kbd-text-color); + background-color: var(--kbd-bg-color); + border-radius: 0.25rem; + border: solid 1px var(--kbd-wrap-color); + box-shadow: inset 0 -2px 0 var(--kbd-wrap-color); +} + +footer { + font-size: 0.8rem; + background-color: var(--main-bg); + + div.d-flex { + height: $footer-height; + line-height: 1.2rem; + padding-bottom: 1rem; + border-top: 1px solid var(--main-border-color); + flex-wrap: wrap; + } + + a { + @extend %text-color; + + &:hover { + @extend %link-hover; + } + } + + p { + width: 100%; + text-align: center; + margin-bottom: 0; + } +} + +/* fontawesome icons */ +i { + &.far, + &.fas { + @extend %no-cursor; + } +} + +/* --- Panels --- */ + +.access { + top: 2rem; + transition: top 0.2s ease-in-out; + margin-top: 3rem; + margin-bottom: 4rem; + + &:only-child { + position: -webkit-sticky; + position: sticky; + } + + > div { + padding-left: 1rem; + border-left: 1px solid 
var(--main-border-color); + + &:not(:last-child) { + margin-bottom: 4rem; + } + } + + .post-content { + font-size: 0.9rem; + } +} + +#panel-wrapper { + /* the headings */ + .panel-heading { + @include label(inherit); + } + + .post-tag { + line-height: 1.05rem; + font-size: 0.85rem; + border: 1px solid var(--btn-border-color); + border-radius: 0.8rem; + padding: 0.3rem 0.5rem; + margin: 0 0.35rem 0.5rem 0; + + &:hover { + transition: all 0.3s ease-in; + } + } +} + +#access-lastmod { + a { + &:hover { + @extend %link-hover; + } + + @extend %no-bottom-border; + + color: inherit; + } +} + +.footnotes > ol { + padding-left: 2rem; + margin-top: 0.5rem; + + > li { + &:not(:last-child) { + margin-bottom: 0.3rem; + } + + > p { + margin-left: 0.25em; + margin-top: 0; + margin-bottom: 0; + } + } +} + +.footnote { + @at-root a#{&} { + @include ml-mr(1px); + @include pl-pr(2px); + + border-bottom-style: none !important; + transition: background-color 1.5s ease-in-out; + } +} + +.reversefootnote { + @at-root a#{&} { + font-size: 0.6rem; + line-height: 1; + position: relative; + bottom: 0.25em; + margin-left: 0.25em; + border-bottom-style: none !important; + } +} + +/* --- Begin of Markdown table style --- */ + +/* it will be created by Liquid */ +.table-wrapper { + overflow-x: auto; + margin-bottom: 1.5rem; + + > table { + min-width: 100%; + overflow-x: auto; + border-spacing: 0; + + thead { + border-bottom: solid 2px rgba(210, 215, 217, 0.75); + + th { + @extend %table-cell; + } + } + + tbody { + tr { + border-bottom: 1px solid var(--tb-border-color); + + &:nth-child(2n) { + background-color: var(--tb-even-bg); + } + + &:nth-child(2n + 1) { + background-color: var(--tb-odd-bg); + } + + td { + @extend %table-cell; + } + } + } /* tbody */ + } /* table */ +} + +/* --- post --- */ + +.post-preview { + @extend %rounded; + + border: 0; + background: var(--card-bg); + box-shadow: var(--card-shadow); + + &::before { + @extend %rounded; + + content: ''; + width: 100%; + height: 100%; + position: absolute; + background-color: var(--card-hovor-bg); + opacity: 0; + transition: opacity 0.35s ease-in-out; + } + + &:hover { + &::before { + opacity: 0.3; + } + } +} + +.post { + h1 { + margin-top: 2rem; + margin-bottom: 1.5rem; + } + + p { + > img[data-src], + > a.popup { + &:not(.normal):not(.left):not(.right) { + @include align-center; + } + } + } +} + +.post-meta { + font-size: 0.85rem; + + a { + &:not([class]):hover { + @extend %link-hover; + } + } + + em { + @extend %normal-font-style; + } +} + +.post-content { + font-size: 1.08rem; + margin-top: 2rem; + overflow-wrap: break-word; + + a { + &.popup { + @extend %no-cursor; + @extend %img-caption; + @include mt-mb(0.5rem); + + cursor: zoom-in; + } + + &:not(.img-link) { + @extend %link-underline; + + &:hover { + @extend %link-hover; + } + } + } + + ol, + ul { + &:not([class]), + &.task-list { + -webkit-padding-start: 1.75rem; + padding-inline-start: 1.75rem; + + li { + margin: 0.25rem 0; + padding-left: 0.25rem; + } + + ol, + ul { + -webkit-padding-start: 1.25rem; + padding-inline-start: 1.25rem; + margin: 0.5rem 0; + } + } + } + + ul.task-list { + -webkit-padding-start: 1.25rem; + padding-inline-start: 1.25rem; + + li { + list-style-type: none; + padding-left: 0; + + /* checkbox icon */ + > i { + width: 2rem; + margin-left: -1.25rem; + color: var(--checkbox-color); + + &.checked { + color: var(--checkbox-checked-color); + } + } + + ul { + -webkit-padding-start: 1.75rem; + padding-inline-start: 1.75rem; + } + } + + input[type='checkbox'] { + margin: 0 0.5rem 0.2rem 
-1.3rem; + vertical-align: middle; + } + } /* ul */ + + dl > dd { + margin-left: 1rem; + } + + ::marker { + color: var(--text-muted-color); + } +} /* .post-content */ + +.tag:hover { + @extend %tag-hover; +} + +.post-tag { + display: inline-block; + min-width: 2rem; + text-align: center; + border-radius: 0.3rem; + padding: 0 0.4rem; + color: inherit; + line-height: 1.3rem; + + &:not(:last-child) { + margin-right: 0.2rem; + } +} + +.rounded-10 { + border-radius: 10px !important; +} + +.img-link { + color: transparent; + display: inline-flex; +} + +.shimmer { + overflow: hidden; + position: relative; + background: var(--img-bg); + + &::before { + content: ''; + position: absolute; + background: var(--shimmer-bg); + height: 100%; + width: 100%; + -webkit-animation: shimmer 1s infinite; + animation: shimmer 1s infinite; + } + + @-webkit-keyframes shimmer { + 0% { + transform: translateX(-100%); + } + 100% { + transform: translateX(100%); + } + } + + @keyframes shimmer { + 0% { + transform: translateX(-100%); + } + 100% { + transform: translateX(100%); + } + } +} + +.embed-video { + width: 100%; + height: 100%; + margin-bottom: 1rem; + + @extend %rounded; + + &.youtube { + aspect-ratio: 16 / 9; + } + + &.twitch { + aspect-ratio: 310 / 189; + } +} + +/* --- buttons --- */ +.btn-lang { + border: 1px solid !important; + padding: 1px 3px; + border-radius: 3px; + color: var(--link-color); + + &:focus { + box-shadow: none; + } +} + +/* --- Effects classes --- */ + +.loaded { + display: block !important; + + @at-root .d-flex#{&} { + display: flex !important; + } +} + +.unloaded { + display: none !important; +} + +.visible { + visibility: visible !important; +} + +.hidden { + visibility: hidden !important; +} + +.flex-grow-1 { + flex-grow: 1 !important; +} + +.btn-box-shadow { + box-shadow: 0 0 8px 0 var(--btn-box-shadow) !important; +} + +/* overwrite bootstrap muted */ +.text-muted { + color: var(--text-muted-color) !important; +} + +/* Overwrite bootstrap tooltip */ +.tooltip-inner { + font-size: 0.7rem; + max-width: 220px; + text-align: left; +} + +/* Overwrite bootstrap outline button */ +.btn.btn-outline-primary { + &:not(.disabled):hover { + border-color: #007bff !important; + } +} + +.disabled { + color: rgb(206, 196, 196); + pointer-events: auto; + cursor: not-allowed; +} + +.hide-border-bottom { + border-bottom: none !important; +} + +.input-focus { + box-shadow: none; + border-color: var(--input-focus-border-color) !important; + background: center !important; + transition: background-color 0.15s ease-in-out, border-color 0.15s ease-in-out; +} + +.left { + float: left; + margin: 0.75rem 1rem 1rem 0 !important; +} + +.right { + float: right; + margin: 0.75rem 0 1rem 1rem !important; +} + +/* --- Overriding --- */ + +/* magnific-popup */ + +figure .mfp-title { + text-align: center; + padding-right: 0; + margin-top: 0.5rem; +} + +.mfp-img { + transition: none; +} + +/* mermaid */ +.mermaid { + text-align: center; +} + +/* MathJax */ +mjx-container { + overflow-y: hidden; + min-width: auto !important; +} + +/* --- sidebar layout --- */ + +$sidebar-display: 'sidebar-display'; +$btn-gap: 0.8rem; // for the bottom icons +$btn-border-width: 3px; +$btn-mb: 0.5rem; + +#sidebar { + @include pl-pr(0); + + position: fixed; + top: 0; + left: 0; + height: 100%; + overflow-y: auto; + width: $sidebar-width; + z-index: 99; + background: var(--sidebar-bg); + + /* Hide scrollbar for Chrome, Safari and Opera */ + &::-webkit-scrollbar { + display: none; + } + + /* Hide scrollbar for IE, Edge and Firefox */ + 
-ms-overflow-style: none; /* IE and Edge */ + scrollbar-width: none; /* Firefox */ + + %sidebar-link-hover { + &:hover { + color: var(--sidebar-active-color); + } + } + + a { + @extend %sidebar-links; + } + + #avatar { + display: block; + width: 7rem; + height: 7rem; + overflow: hidden; + box-shadow: var(--avatar-border-color) 0 0 0 2px; + transform: translateZ(0); /* fixed the zoom in Safari */ + + img { + transition: transform 0.5s; + + &:hover { + transform: scale(1.2); + } + } + } + + .profile-wrapper { + @include mt-mb(2.5rem); + @extend %clickable-transition; + + padding-left: 2.5rem; + padding-right: 1.25rem; + width: 100%; + } + + .site-title { + font-weight: 900; + font-size: 1.75rem; + line-height: 1.2; + letter-spacing: 0.25px; + color: rgba(134, 133, 133, 0.99); + margin-top: 1.25rem; + margin-bottom: 0.5rem; + + a { + @extend %clickable-transition; + @extend %sidebar-link-hover; + } + } + + .site-subtitle { + font-size: 95%; + color: var(--sidebar-muted-color); + margin-top: 0.25rem; + word-spacing: 1px; + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; + } + + ul { + margin-bottom: 2rem; + + li.nav-item { + opacity: 0.9; + width: 100%; + padding-left: 1.5rem; + padding-right: 1.5rem; + + a.nav-link { + @include pt-pb(0.6rem); + + display: flex; + align-items: center; + border-radius: 0.75rem; + font-weight: 600; + + &:hover { + background-color: var(--sidebar-hover-bg); + } + + i { + font-size: 95%; + opacity: 0.8; + margin-right: 1.5rem; + } + + span { + font-size: 90%; + letter-spacing: 0.2px; + } + } + + &.active { + .nav-link { + color: var(--sidebar-active-color); + background-color: var(--sidebar-hover-bg); + + span { + opacity: 1; + } + } + } + + &:not(:first-child) { + margin-top: 0.25rem; + } + } + } + + .sidebar-bottom { + @include pl-pr(2rem); + + margin-bottom: 1.5rem; + + %button { + width: 1.75rem; + height: 1.75rem; + margin-bottom: $btn-mb; // multi line gap + border-radius: 50%; + color: var(--sidebar-btn-color); + background-color: var(--sidebar-btn-bg); + text-align: center; + display: flex; + align-items: center; + justify-content: center; + + &:hover { + background-color: var(--sidebar-hover-bg); + } + } + + a { + @extend %button; + @extend %sidebar-link-hover; + @extend %clickable-transition; + + &:not(:last-child) { + margin-right: $btn-gap; + } + } + + i { + line-height: 1.75rem; + } + + .mode-toggle { + padding: 0; + border: 0; + + @extend %button; + @extend %sidebar-links; + @extend %sidebar-link-hover; + } + + .icon-border { + @extend %no-cursor; + @include ml-mr(calc(($btn-gap - $btn-border-width) / 2)); + + background-color: var(--sidebar-muted-color); + content: ''; + width: $btn-border-width; + height: $btn-border-width; + border-radius: 50%; + margin-bottom: $btn-mb; + } + } /* .sidebar-bottom */ +} /* #sidebar */ + +@media (hover: hover) { + #sidebar ul > li:last-child::after { + transition: top 0.5s ease; + } + + .nav-link { + transition: background-color 0.3s ease-in-out; + } + + .post-preview { + transition: background-color 0.35s ease-in-out; + } +} + +#search-result-wrapper { + display: none; + height: 100%; + width: 100%; + overflow: auto; + + .post-content { + margin-top: 2rem; + } +} + +/* --- top-bar --- */ + +#topbar-wrapper { + height: $topbar-height; + background-color: var(--topbar-bg); +} + +#topbar { + /* icons */ + i { + color: #999999; + } + + #breadcrumb { + font-size: 1rem; + color: gray; + padding-left: 0.5rem; + + a:hover { + @extend %link-hover; + } + + span { + 
&:not(:last-child) { + &::after { + content: '›'; + padding: 0 0.3rem; + } + } + } + } +} /* #topbar */ + +#sidebar-trigger, +#search-trigger { + display: none; +} + +#search-wrapper { + display: flex; + width: 100%; + border-radius: 1rem; + border: 1px solid var(--search-wrapper-border-color); + background: var(--main-bg); + padding: 0 0.5rem; + + i { + z-index: 2; + font-size: 0.9rem; + color: var(--search-icon-color); + } +} + +/* 'Cancel' link */ +#search-cancel { + color: var(--link-color); + margin-left: 0.75rem; + display: none; + white-space: nowrap; + + @extend %cursor-pointer; +} + +#search-input { + background: center; + border: 0; + border-radius: 0; + padding: 0.18rem 0.3rem; + color: var(--text-color); + height: auto; + + &:focus { + box-shadow: none; + + &.form-control { + &::-moz-placeholder { + @include input-placeholder; + } + &::-webkit-input-placeholder { + @include input-placeholder; + } + &:-ms-input-placeholder { + @include input-placeholder; + } + &::-ms-input-placeholder { + @include input-placeholder; + } + &::placeholder { + @include input-placeholder; + } + } + } +} + +#search-hints { + padding: 0 1rem; + + h4 { + margin-bottom: 1.5rem; + } + + .post-tag { + display: inline-block; + line-height: 1rem; + font-size: 1rem; + background: var(--search-tag-bg); + border: none; + padding: 0.5rem; + margin: 0 1.25rem 1rem 0; + + &::before { + content: '#'; + color: var(--text-muted-color); + padding-right: 0.2rem; + } + + @extend %link-color; + } +} + +#search-results { + padding-bottom: 3rem; + + a { + &:hover { + @extend %link-hover; + } + + @extend %link-color; + @extend %no-bottom-border; + @extend %heading; + + font-size: 1.4rem; + line-height: 2.5rem; + } + + > div { + width: 100%; + + &:not(:last-child) { + margin-bottom: 1rem; + } + + /* icons */ + i { + color: #818182; + margin-right: 0.15rem; + font-size: 80%; + } + + > p { + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 3; + -webkit-box-orient: vertical; + } + } +} /* #search-results */ + +#topbar-title { + display: none; + font-size: 1.1rem; + font-weight: 600; + font-family: sans-serif; + color: var(--topbar-text-color); + text-align: center; + width: 70%; + overflow: hidden; + text-overflow: ellipsis; + word-break: keep-all; + white-space: nowrap; +} + +#core-wrapper { + line-height: 1.75; + + .categories, + #tags, + #archives { + a:not(:hover) { + @extend %no-bottom-border; + } + } +} + +#mask { + display: none; + position: fixed; + inset: 0 0 0 0; + height: 100%; + width: 100%; + z-index: 1; + + @at-root [#{$sidebar-display}] & { + display: block !important; + } +} + +/* --- main wrapper --- */ + +#main-wrapper { + background-color: var(--main-bg); + position: relative; + min-height: calc(100vh - $footer-height-mobile); + + @include pl-pr(0); +} + +#topbar-wrapper.row, +#main > .row, +#search-result-wrapper > .row { + @include ml-mr(0); +} + +/* --- button back-to-top --- */ + +#back-to-top { + $size: 3rem; + + display: none; + z-index: 1; + cursor: pointer; + position: fixed; + right: 1rem; + bottom: 2rem; + background: var(--button-bg); + color: var(--btn-backtotop-color); + padding: 0; + width: $size; + height: $size; + border-radius: 50%; + border: 1px solid var(--btn-backtotop-border-color); + transition: transform 0.2s ease-out; + -webkit-transition: transform 0.2s ease-out; + + &:hover { + transform: translate3d(0, -5px, 0); + -webkit-transform: translate3d(0, -5px, 0); + } + + i { + line-height: $size; + position: relative; + bottom: 2px; + } +} + 
+#notification { + @-webkit-keyframes popup { + from { + opacity: 0; + bottom: 0; + } + } + + @keyframes popup { + from { + opacity: 0; + bottom: 0; + } + } + + .toast-header { + background: none; + border-bottom: none; + color: inherit; + } + + .toast-body { + font-family: Lato, sans-serif; + line-height: 1.25rem; + + button { + font-size: 90%; + min-width: 4rem; + } + } + + &.toast { + &.show { + display: block; + min-width: 20rem; + border-radius: 0.5rem; + -webkit-backdrop-filter: blur(10px); + backdrop-filter: blur(10px); + background-color: rgba(255, 255, 255, 0.5); + color: #1b1b1eba; + position: fixed; + left: 50%; + bottom: 20%; + transform: translateX(-50%); + -webkit-animation: popup 0.8s; + animation: popup 0.8s; + } + } +} + +/* + Responsive Design: + + {sidebar, content, panel} >= 1200px screen width + {sidebar, content} >= 850px screen width + {content} <= 849px screen width + +*/ + +@media all and (max-width: 576px) { + #main-wrapper { + min-height: calc(100vh - #{$footer-height-mobile}); + } + + #core-wrapper { + .post-content { + > blockquote[class^='prompt-'] { + @include ml-mr(-1.25rem); + + border-radius: 0; + max-width: none; + } + } + } + + #avatar { + width: 5rem; + height: 5rem; + } +} + +@media all and (max-width: 768px) { + %full-width { + max-width: 100%; + } + + #topbar { + @extend %full-width; + } + + #main { + @extend %full-width; + @include pl-pr(0); + } +} + +/* hide sidebar and panel */ +@media all and (max-width: 849px) { + @mixin slide($append: null) { + $basic: transform 0.4s ease; + + @if $append { + transition: $basic, $append; + } @else { + transition: $basic; + } + } + + html, + body { + overflow-x: hidden; + } + + footer { + @include slide; + + height: $footer-height-mobile; + + div.d-flex { + padding: 1.5rem 0; + line-height: 1.65; + flex-wrap: wrap; + } + } + + [#{$sidebar-display}] { + #sidebar { + transform: translateX(0); + } + + #main-wrapper, + footer { + transform: translateX(#{$sidebar-width}); + } + + #back-to-top { + visibility: hidden; + } + } + + #sidebar { + @include slide; + + transform: translateX(-#{$sidebar-width}); /* hide */ + -webkit-transform: translateX(-#{$sidebar-width}); + } + + #main-wrapper { + @include slide; + } + + #topbar, + #main, + footer > .container { + max-width: 100%; + } + + #search-result-wrapper { + width: 100%; + } + + #breadcrumb, + #search-wrapper { + display: none; + } + + #topbar-wrapper { + @include slide(top 0.2s ease); + + left: 0; + } + + #core-wrapper, + #panel-wrapper { + margin-top: 0; + } + + #topbar-title, + #sidebar-trigger, + #search-trigger { + display: block; + } + + #search-result-wrapper .post-content { + letter-spacing: 0; + } + + #tags { + justify-content: center !important; + } + + h1.dynamic-title { + display: none; + + ~ .post-content { + margin-top: 2.5rem; + } + } +} /* max-width: 849px */ + +/* Phone & Pad */ +@media all and (min-width: 577px) and (max-width: 1199px) { + footer .d-flex > div { + width: 312px; + } +} + +/* Sidebar is visible */ +@media all and (min-width: 850px) { + /* Solved jumping scrollbar */ + html { + overflow-y: scroll; + } + + #main-wrapper, + footer { + margin-left: $sidebar-width; + } + + #main-wrapper { + min-height: calc(100vh - $footer-height); + } + + footer { + p { + width: auto; + &:last-child { + &::before { + content: '-'; + margin: 0 0.75rem; + opacity: 0.8; + } + } + } + } + + #sidebar { + .profile-wrapper { + margin-top: 3rem; + } + } + + #search-hints { + display: none; + } + + #search-wrapper { + max-width: $search-max-width; + } + + 
#search-result-wrapper { + max-width: $main-content-max-width; + justify-content: start !important; + } + + .post { + h1 { + margin-top: 3rem; + } + } + + div.post-content .table-wrapper > table { + min-width: 70%; + } + + /* button 'back-to-Top' position */ + #back-to-top { + bottom: 5.5rem; + right: 5%; + } + + #topbar-title { + text-align: left; + } +} + +/* Pad horizontal */ +@media all and (min-width: 992px) and (max-width: 1199px) { + #main .col-lg-11 { + flex: 0 0 96%; + max-width: 96%; + } +} + +/* Compact icons in sidebar & panel hidden */ +@media all and (min-width: 850px) and (max-width: 1199px) { + #search-results > div { + max-width: 700px; + } + + #breadcrumb { + width: 65%; + overflow: hidden; + text-overflow: ellipsis; + word-break: keep-all; + white-space: nowrap; + } +} + +/* panel hidden */ +@media all and (max-width: 1199px) { + #panel-wrapper { + display: none; + } + + #main > div.row { + justify-content: center !important; + } +} + +/* --- desktop mode, both sidebar and panel are visible --- */ + +@media all and (min-width: 1200px) { + #back-to-top { + bottom: 6.5rem; + } + + #search-wrapper { + margin-right: 4rem; + } + + #search-input { + transition: all 0.3s ease-in-out; + } + + #search-results > div { + width: 46%; + + &:nth-child(odd) { + margin-right: 1.5rem; + } + + &:nth-child(even) { + margin-left: 1.5rem; + } + + &:last-child:nth-child(odd) { + position: relative; + right: 24.3%; + } + } + + .post-content { + font-size: 1.03rem; + } + + footer { + div.d-felx { + width: 85%; + } + } +} + +@media all and (min-width: 1400px) { + #back-to-top { + right: calc((100vw - #{$sidebar-width} - 1140px) / 2 + 3rem); + } +} + +@media all and (min-width: 1650px) { + $icon-gap: 1rem; + + #main-wrapper, + footer { + margin-left: $sidebar-width-large; + } + + #topbar-wrapper { + left: $sidebar-width-large; + } + + #search-wrapper { + margin-right: calc( + #{$main-content-max-width} * 0.25 - #{$search-max-width} - 0.75rem + ); + } + + #main, + footer > .container { + max-width: $main-content-max-width; + padding-left: 1.75rem !important; + padding-right: 1.75rem !important; + } + + #core-wrapper, + #tail-wrapper { + padding-right: 4.5rem !important; + } + + #back-to-top { + right: calc( + (100vw - #{$sidebar-width-large} - #{$main-content-max-width}) / 2 + 2rem + ); + } + + #sidebar { + width: $sidebar-width-large; + + $icon-gap: 1rem; // for the bottom icons + + .profile-wrapper { + margin-top: 3.5rem; + margin-bottom: 2.5rem; + padding-left: 3.5rem; + } + + ul { + li.nav-item { + @include pl-pr(2.75rem); + } + } + + .sidebar-bottom { + padding-left: 2.75rem; + margin-bottom: 1.75rem; + + a:not(:last-child) { + margin-right: $icon-gap; + } + + .icon-border { + @include ml-mr(calc(($icon-gap - $btn-border-width) / 2)); + } + } + } +} /* min-width: 1650px */ diff --git a/_sass/addon/module.scss b/_sass/addon/module.scss new file mode 100644 index 0000000..10e0d69 --- /dev/null +++ b/_sass/addon/module.scss @@ -0,0 +1,173 @@ +/* +* Mainly scss modules, only imported to `assets/css/main.scss` +*/ + +/* ---------- scss placeholder --------- */ + +%heading { + color: var(--heading-color); + font-weight: 400; + font-family: $font-family-heading; +} + +%section { + #core-wrapper & { + margin-top: 2.5rem; + margin-bottom: 1.25rem; + + &:focus { + outline: none; /* avoid outline in Safari */ + } + } +} + +%anchor { + .anchor { + font-size: 80%; + } + + @media (hover: hover) { + .anchor { + visibility: hidden; + opacity: 0; + transition: opacity 0.25s ease-in, visibility 0s ease-in 
0.25s; + } + + &:hover { + .anchor { + visibility: visible; + opacity: 1; + transition: opacity 0.25s ease-in, visibility 0s ease-in 0s; + } + } + } +} + +%tag-hover { + background: var(--tag-hover); + transition: background 0.35s ease-in-out; +} + +%table-cell { + padding: 0.4rem 1rem; + font-size: 95%; + white-space: nowrap; +} + +%link-hover { + color: #d2603a !important; + border-bottom: 1px solid #d2603a; + text-decoration: none; +} + +%link-color { + color: var(--link-color); +} + +%link-underline { + border-bottom: 1px solid var(--link-underline-color); +} + +%clickable-transition { + transition: all 0.3s ease-in-out; +} + +%no-cursor { + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; +} + +%no-bottom-border { + border-bottom: none; +} + +%cursor-pointer { + cursor: pointer; +} + +%normal-font-style { + font-style: normal; +} + +%rounded { + border-radius: $base-radius; +} + +%img-caption { + + em { + display: block; + text-align: center; + font-style: normal; + font-size: 80%; + padding: 0; + color: #6d6c6c; + } +} + +%sidebar-links { + color: rgba(117, 117, 117, 0.9); + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; +} + +%text-clip { + display: -webkit-box; + overflow: hidden; + text-overflow: ellipsis; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +/* ---------- scss mixin --------- */ + +@mixin mt-mb($value) { + margin-top: $value; + margin-bottom: $value; +} + +@mixin ml-mr($value) { + margin-left: $value; + margin-right: $value; +} + +@mixin pt-pb($val) { + padding-top: $val; + padding-bottom: $val; +} + +@mixin pl-pr($val) { + padding-left: $val; + padding-right: $val; +} + +@mixin input-placeholder { + opacity: 0.6; +} + +@mixin label($font-size: 1rem, $font-weight: 600, $color: var(--label-color)) { + color: $color; + font-size: $font-size; + font-weight: $font-weight; +} + +@mixin align-center { + position: relative; + left: 50%; + transform: translateX(-50%); +} + +@mixin prompt($type, $fa-content, $fa-style: 'solid') { + &.prompt-#{$type} { + background-color: var(--prompt-#{$type}-bg); + + &::before { + content: $fa-content; + color: var(--prompt-#{$type}-icon-color); + font: var(--fa-font-#{$fa-style}); + } + } +} diff --git a/_sass/addon/syntax.scss b/_sass/addon/syntax.scss new file mode 100644 index 0000000..df756a7 --- /dev/null +++ b/_sass/addon/syntax.scss @@ -0,0 +1,270 @@ +/* +* The syntax highlight. 
+*/ + +@import 'colors/light-syntax'; +@import 'colors/dark-syntax'; + +html { + @media (prefers-color-scheme: light) { + &:not([data-mode]), + &[data-mode='light'] { + @include light-syntax; + } + + &[data-mode='dark'] { + @include dark-syntax; + } + } + + @media (prefers-color-scheme: dark) { + &:not([data-mode]), + &[data-mode='dark'] { + @include dark-syntax; + } + + &[data-mode='light'] { + @include light-syntax; + } + } +} + +/* -- code snippets -- */ + +%code-snippet-bg { + background-color: var(--highlight-bg-color); +} + +%code-snippet-padding { + padding-left: 1rem; + padding-right: 1.5rem; +} + +.highlighter-rouge { + color: var(--highlighter-rouge-color); + margin-top: 0.5rem; + margin-bottom: 1.2em; /* Override BS Inline-code style */ +} + +.highlight { + @extend %rounded; + @extend %code-snippet-bg; + + @at-root figure#{&} { + @extend %code-snippet-bg; + } + + overflow: auto; + padding-top: 0.5rem; + padding-bottom: 1rem; + + pre { + margin-bottom: 0; + font-size: $code-font-size; + line-height: 1.4rem; + word-wrap: normal; /* Fixed Safari overflow-x */ + } + + table { + td pre { + overflow: visible; /* Fixed iOS safari overflow-x */ + word-break: normal; /* Fixed iOS safari linenos code break */ + } + } + + .lineno { + padding-right: 0.5rem; + min-width: 2.2rem; + text-align: right; + color: var(--highlight-lineno-color); + -webkit-user-select: none; + -moz-user-select: none; + -o-user-select: none; + -ms-user-select: none; + user-select: none; + } +} /* .highlight */ + +code { + -webkit-hyphens: none; + -ms-hyphens: none; + hyphens: none; + + &.highlighter-rouge { + font-size: $code-font-size; + padding: 3px 5px; + word-break: break-word; + border-radius: 4px; + background-color: var(--inline-code-bg); + } + + &.filepath { + background-color: inherit; + color: var(--filepath-text-color); + font-weight: 600; + padding: 0; + } + + a > &.highlighter-rouge { + padding-bottom: 0; /* show link's underlinke */ + color: inherit; + } + + a:hover > &.highlighter-rouge { + border-bottom: none; + } + + blockquote & { + color: inherit; + } +} + +td.rouge-code { + @extend %code-snippet-padding; + + /* + Prevent some browser extends from + changing the URL string of code block. 
+ */ + a { + color: inherit !important; + border-bottom: none !important; + pointer-events: none; + } +} + +div[class^='language-'] { + @extend %rounded; + @extend %code-snippet-bg; + + box-shadow: var(--language-border-color) 0 0 0 1px; + + .post-content > & { + @include ml-mr(-1.25rem); + + border-radius: 0; + } +} + +/* Hide line numbers for default, console, and terminal code snippets */ +div { + &.nolineno, + &.language-plaintext, + &.language-console, + &.language-terminal { + pre.lineno { + display: none; + } + + td.rouge-code { + padding-left: 1.5rem; + } + } +} + +.code-header { + @extend %no-cursor; + + $code-header-height: 2.25rem; + + display: flex; + justify-content: space-between; + align-items: center; + height: $code-header-height; + margin-left: 1rem; + margin-right: 0.5rem; + + /* the label block */ + span { + /* label icon */ + i { + font-size: 1rem; + margin-right: 0.5rem; + color: var(--code-header-icon-color); + + &.small { + font-size: 70%; + } + } + + @at-root [file] #{&} > i { + position: relative; + top: 1px; /* center the file icon */ + } + + /* label text */ + &::after { + content: attr(data-label-text); + font-size: 0.85rem; + font-weight: 600; + color: var(--code-header-text-color); + } + } + + /* clipboard */ + button { + @extend %cursor-pointer; + @extend %rounded; + + border: 1px solid transparent; + height: $code-header-height; + width: $code-header-height; + padding: 0; + background-color: inherit; + + i { + color: var(--code-header-icon-color); + } + + &[timeout] { + &:hover { + border-color: var(--clipboard-checked-color); + } + + i { + color: var(--clipboard-checked-color); + } + } + + &:focus { + outline: none; + } + + &:not([timeout]):hover { + background-color: rgba(128, 128, 128, 0.37); + + i { + color: white; + } + } + } +} + +@media all and (min-width: 576px) { + div[class^='language-'] { + .post-content > & { + @include ml-mr(0); + + border-radius: $base-radius; + } + + .code-header { + @include ml-mr(0); + + &::before { + $dot-size: 0.75rem; + $dot-margin: 0.5rem; + + content: ''; + display: inline-block; + margin-left: 1rem; + width: $dot-size; + height: $dot-size; + border-radius: 50%; + background-color: var(--code-header-muted-color); + box-shadow: ($dot-size + $dot-margin) 0 0 var(--code-header-muted-color), + ($dot-size + $dot-margin) * 2 0 0 var(--code-header-muted-color); + } + } + } +} diff --git a/_sass/addon/variables.scss b/_sass/addon/variables.scss new file mode 100644 index 0000000..0c68281 --- /dev/null +++ b/_sass/addon/variables.scss @@ -0,0 +1,27 @@ +/* + * The SCSS variables + */ + +/* sidebar */ + +$sidebar-width: 260px !default; /* the basic width */ +$sidebar-width-large: 300px !default; /* screen width: >= 1650px */ + +/* other framework sizes */ + +$topbar-height: 3rem !default; +$search-max-width: 210px !default; +$footer-height: 5rem !default; +$footer-height-mobile: 6rem !default; /* screen width: < 850px */ +$main-content-max-width: 1250px !default; +$bottom-min-height: 35rem !default; +$base-radius: 0.5rem; + +/* syntax highlight */ + +$code-font-size: 0.85rem !default; + +/* fonts */ + +$font-family-base: 'Source Sans Pro', 'Microsoft Yahei', sans-serif; +$font-family-heading: Lato, 'Microsoft Yahei', sans-serif; diff --git a/_sass/colors/dark-syntax.scss b/_sass/colors/dark-syntax.scss new file mode 100644 index 0000000..36e9651 --- /dev/null +++ b/_sass/colors/dark-syntax.scss @@ -0,0 +1,91 @@ +/* + * The syntax dark mode styles. 
+ */ + +@mixin dark-syntax { + --language-border-color: rgba(84, 83, 83, 0.27); + --highlight-bg-color: #252525; + --highlighter-rouge-color: #de6b18; + --highlight-lineno-color: #6c6c6d; + --inline-code-bg: #272822; + --code-header-text-color: #6a6a6a; + --code-header-muted-color: rgb(60, 60, 60); + --code-header-icon-color: rgb(86, 86, 86); + --clipboard-checked-color: #2bcc2b; + --filepath-text-color: #bdbdbd; + + /* override Bootstrap */ + pre { + color: #bfbfbf; + } + + .highlight .gp { + color: #818c96; + } + + /* syntax highlight colors from https://raw.githubusercontent.com/jwarby/pygments-css/master/monokai.css */ + + .highlight pre { background-color: var(--highlight-bg-color); } + .highlight .hll { background-color: var(--highlight-bg-color); } + .highlight .c { color: #75715e; } /* Comment */ + .highlight .err { color: #960050; background-color: #1e0010; } /* Error */ + .highlight .k { color: #66d9ef; } /* Keyword */ + .highlight .l { color: #ae81ff; } /* Literal */ + .highlight .n { color: #f8f8f2; } /* Name */ + .highlight .o { color: #f92672; } /* Operator */ + .highlight .p { color: #f8f8f2; } /* Punctuation */ + .highlight .cm { color: #75715e; } /* Comment.Multiline */ + .highlight .cp { color: #75715e; } /* Comment.Preproc */ + .highlight .c1 { color: #75715e; } /* Comment.Single */ + .highlight .cs { color: #75715e; } /* Comment.Special */ + .highlight .ge { color: inherit; font-style: italic; } /* Generic.Emph */ + .highlight .gs { font-weight: bold; } /* Generic.Strong */ + .highlight .kc { color: #66d9ef; } /* Keyword.Constant */ + .highlight .kd { color: #66d9ef; } /* Keyword.Declaration */ + .highlight .kn { color: #f92672; } /* Keyword.Namespace */ + .highlight .kp { color: #66d9ef; } /* Keyword.Pseudo */ + .highlight .kr { color: #66d9ef; } /* Keyword.Reserved */ + .highlight .kt { color: #66d9ef; } /* Keyword.Type */ + .highlight .ld { color: #e6db74; } /* Literal.Date */ + .highlight .m { color: #ae81ff; } /* Literal.Number */ + .highlight .s { color: #e6db74; } /* Literal.String */ + .highlight .na { color: #a6e22e; } /* Name.Attribute */ + .highlight .nb { color: #f8f8f2; } /* Name.Builtin */ + .highlight .nc { color: #a6e22e; } /* Name.Class */ + .highlight .no { color: #66d9ef; } /* Name.Constant */ + .highlight .nd { color: #a6e22e; } /* Name.Decorator */ + .highlight .ni { color: #f8f8f2; } /* Name.Entity */ + .highlight .ne { color: #a6e22e; } /* Name.Exception */ + .highlight .nf { color: #a6e22e; } /* Name.Function */ + .highlight .nl { color: #f8f8f2; } /* Name.Label */ + .highlight .nn { color: #f8f8f2; } /* Name.Namespace */ + .highlight .nx { color: #a6e22e; } /* Name.Other */ + .highlight .py { color: #f8f8f2; } /* Name.Property */ + .highlight .nt { color: #f92672; } /* Name.Tag */ + .highlight .nv { color: #f8f8f2; } /* Name.Variable */ + .highlight .ow { color: #f92672; } /* Operator.Word */ + .highlight .w { color: #f8f8f2; } /* Text.Whitespace */ + .highlight .mf { color: #ae81ff; } /* Literal.Number.Float */ + .highlight .mh { color: #ae81ff; } /* Literal.Number.Hex */ + .highlight .mi { color: #ae81ff; } /* Literal.Number.Integer */ + .highlight .mo { color: #ae81ff; } /* Literal.Number.Oct */ + .highlight .sb { color: #e6db74; } /* Literal.String.Backtick */ + .highlight .sc { color: #e6db74; } /* Literal.String.Char */ + .highlight .sd { color: #e6db74; } /* Literal.String.Doc */ + .highlight .s2 { color: #e6db74; } /* Literal.String.Double */ + .highlight .se { color: #ae81ff; } /* Literal.String.Escape */ + .highlight .sh { color: 
#e6db74; } /* Literal.String.Heredoc */ + .highlight .si { color: #e6db74; } /* Literal.String.Interpol */ + .highlight .sx { color: #e6db74; } /* Literal.String.Other */ + .highlight .sr { color: #e6db74; } /* Literal.String.Regex */ + .highlight .s1 { color: #e6db74; } /* Literal.String.Single */ + .highlight .ss { color: #e6db74; } /* Literal.String.Symbol */ + .highlight .bp { color: #f8f8f2; } /* Name.Builtin.Pseudo */ + .highlight .vc { color: #f8f8f2; } /* Name.Variable.Class */ + .highlight .vg { color: #f8f8f2; } /* Name.Variable.Global */ + .highlight .vi { color: #f8f8f2; } /* Name.Variable.Instance */ + .highlight .il { color: #ae81ff; } /* Literal.Number.Integer.Long */ + .highlight .gu { color: #75715e; } /* Generic.Subheading & Diff Unified/Comment? */ + .highlight .gd { color: #f92672; background-color: #561c08; } /* Generic.Deleted & Diff Deleted */ + .highlight .gi { color: #a6e22e; background-color: #0b5858; } /* Generic.Inserted & Diff Inserted */ +} diff --git a/_sass/colors/dark-typography.scss b/_sass/colors/dark-typography.scss new file mode 100644 index 0000000..310828e --- /dev/null +++ b/_sass/colors/dark-typography.scss @@ -0,0 +1,151 @@ +/* + * The main dark mode styles + */ + +@mixin dark-scheme { + /* Framework color */ + --main-bg: rgb(27, 27, 30); + --mask-bg: rgb(68, 69, 70); + --main-border-color: rgb(44, 45, 45); + + /* Common color */ + --text-color: rgb(175, 176, 177); + --text-muted-color: rgb(107, 116, 124); + --heading-color: #cccccc; + --blockquote-border-color: rgb(66, 66, 66); + --blockquote-text-color: rgb(117, 117, 117); + --link-color: rgb(138, 180, 248); + --link-underline-color: rgb(82, 108, 150); + --button-bg: rgb(39, 40, 43); + --btn-border-color: rgb(63, 65, 68); + --btn-backtotop-color: var(--text-color); + --btn-backtotop-border-color: var(--btn-border-color); + --btn-box-shadow: var(--main-bg); + --card-header-bg: rgb(48, 48, 48); + --label-color: rgb(108, 117, 125); + --checkbox-color: rgb(118, 120, 121); + --checkbox-checked-color: var(--link-color); + --img-bg: radial-gradient(circle, rgb(22, 22, 24) 0%, rgb(32, 32, 32) 100%); + --shimmer-bg: linear-gradient( + 90deg, + rgba(255, 255, 255, 0) 0%, + rgba(58, 55, 55, 0.4) 50%, + rgba(255, 255, 255, 0) 100% + ); + + /* Sidebar */ + --sidebar-bg: radial-gradient(circle, #242424 0%, #1d1f27 100%); + --sidebar-muted-color: #6d6c6b; + --sidebar-active-color: rgb(255, 255, 255, 0.95); + --sidebar-hover-bg: rgb(54, 54, 54, 0.33); + --sidebar-btn-bg: rgb(84, 83, 83, 0.3); + --sidebar-btn-color: #787878; + --avatar-border-color: rgb(206, 206, 206, 0.9); + + /* Topbar */ + --topbar-bg: rgb(27, 27, 30, 0.64); + --topbar-text-color: var(--text-color); + --search-wrapper-border-color: rgb(55, 55, 55); + --search-icon-color: rgb(100, 102, 105); + --input-focus-border-color: rgb(112, 114, 115); + + /* Home page */ + --post-list-text-color: rgb(175, 176, 177); + --btn-patinator-text-color: var(--text-color); + --btn-paginator-hover-color: rgb(64, 65, 66); + --btn-paginator-border-color: var(--btn-border-color); + --btn-text-color: var(--text-color); + + /* Posts */ + --toc-highlight: rgb(116, 178, 243); + --tag-bg: rgb(41, 40, 40); + --tag-hover: rgb(43, 56, 62); + --tb-odd-bg: rgba(42, 47, 53, 0.52); /* odd rows of the posts' table */ + --tb-even-bg: rgb(31, 31, 34); /* even rows of the posts' table */ + --tb-border-color: var(--tb-odd-bg); + --footnote-target-bg: rgb(63, 81, 181); + --btn-share-color: #6c757d; + --btn-share-hover-color: #bfc1ca; + --relate-post-date: var(--text-muted-color); + 
--card-bg: #1e1e1e; + --card-hovor-bg: #464d51; + --card-shadow: rgb(21, 21, 21, 0.72) 0 6px 18px 0, + rgb(137, 135, 135, 0.24) 0 0 0 1px; + --kbd-wrap-color: #6a6a6a; + --kbd-text-color: #d3d3d3; + --kbd-bg-color: #242424; + --prompt-text-color: rgb(216, 212, 212, 0.75); + --prompt-tip-bg: rgb(22, 60, 36, 0.64); + --prompt-tip-icon-color: rgb(15, 164, 15, 0.81); + --prompt-info-bg: rgb(7, 59, 104, 0.8); + --prompt-info-icon-color: #0075d1; + --prompt-warning-bg: rgb(90, 69, 3, 0.88); + --prompt-warning-icon-color: rgb(255, 165, 0, 0.8); + --prompt-danger-bg: rgb(86, 28, 8, 0.8); + --prompt-danger-icon-color: #cd0202; + + /* tags */ + --tag-border: rgb(59, 79, 88); + --tag-shadow: rgb(32, 33, 33); + --search-tag-bg: var(--tag-bg); + --dash-color: rgb(63, 65, 68); + + /* categories */ + --categories-border: rgb(64, 66, 69, 0.5); + --categories-hover-bg: rgb(73, 75, 76); + --categories-icon-hover-color: white; + + /* archives */ + --timeline-node-bg: rgb(150, 152, 156); + --timeline-color: rgb(63, 65, 68); + --timeline-year-dot-color: var(--timeline-color); + + .light { + display: none; + } + + hr { + border-color: var(--main-border-color); + } + + /* categories */ + .categories.card, + .list-group-item { + background-color: var(--card-bg); + } + + .categories { + .card-header { + background-color: var(--card-header-bg); + } + + .list-group-item { + border-left: none; + border-right: none; + padding-left: 2rem; + border-color: var(--categories-border); + + &:last-child { + border-bottom-color: var(--card-bg); + } + } + } + + #archives li:nth-child(odd) { + background-image: linear-gradient( + to left, + rgb(26, 26, 30), + rgb(39, 39, 45), + rgb(39, 39, 45), + rgb(39, 39, 45), + rgb(26, 26, 30) + ); + } + + color-scheme: dark; + + /* stylelint-disable-next-line selector-id-pattern */ + #disqus_thread { + color-scheme: none; + } +} /* dark-scheme */ diff --git a/_sass/colors/light-syntax.scss b/_sass/colors/light-syntax.scss new file mode 100644 index 0000000..040a5f5 --- /dev/null +++ b/_sass/colors/light-syntax.scss @@ -0,0 +1,83 @@ +/* + * The syntax light mode code snippet colors. 
+ */ + +@mixin light-syntax { + /* see: */ + .highlight .hll { background-color: #ffffcc; } + .highlight .c { color: #999988; font-style: italic; } /* Comment */ + .highlight .err { color: #a61717; background-color: #e3d2d2; } /* Error */ + .highlight .k { color: #000000; font-weight: bold; } /* Keyword */ + .highlight .o { color: #000000; font-weight: bold; } /* Operator */ + .highlight .cm { color: #999988; font-style: italic; } /* Comment.Multiline */ + .highlight .cp { color: #999999; font-weight: bold; font-style: italic; } /* Comment.Preproc */ + .highlight .c1 { color: #999988; font-style: italic; } /* Comment.Single */ + .highlight .cs { color: #999999; font-weight: bold; font-style: italic; } /* Comment.Special */ + .highlight .gd { color: #d01040; background-color: #ffdddd; } /* Generic.Deleted */ + .highlight .ge { color: #000000; font-style: italic; } /* Generic.Emph */ + .highlight .gr { color: #aa0000; } /* Generic.Error */ + .highlight .gh { color: #999999; } /* Generic.Heading */ + .highlight .gi { color: #008080; background-color: #ddffdd; } /* Generic.Inserted */ + .highlight .go { color: #888888; } /* Generic.Output */ + .highlight .gp { color: #555555; } /* Generic.Prompt */ + .highlight .gs { font-weight: bold; } /* Generic.Strong */ + .highlight .gu { color: #aaaaaa; } /* Generic.Subheading */ + .highlight .gt { color: #aa0000; } /* Generic.Traceback */ + .highlight .kc { color: #000000; font-weight: bold; } /* Keyword.Constant */ + .highlight .kd { color: #000000; font-weight: bold; } /* Keyword.Declaration */ + .highlight .kn { color: #000000; font-weight: bold; } /* Keyword.Namespace */ + .highlight .kp { color: #000000; font-weight: bold; } /* Keyword.Pseudo */ + .highlight .kr { color: #000000; font-weight: bold; } /* Keyword.Reserved */ + .highlight .kt { color: #445588; font-weight: bold; } /* Keyword.Type */ + .highlight .m { color: #009999; } /* Literal.Number */ + .highlight .s { color: #d01040; } /* Literal.String */ + .highlight .na { color: #008080; } /* Name.Attribute */ + .highlight .nb { color: #0086b3; } /* Name.Builtin */ + .highlight .nc { color: #445588; font-weight: bold; } /* Name.Class */ + .highlight .no { color: #008080; } /* Name.Constant */ + .highlight .nd { color: #3c5d5d; font-weight: bold; } /* Name.Decorator */ + .highlight .ni { color: #800080; } /* Name.Entity */ + .highlight .ne { color: #990000; font-weight: bold; } /* Name.Exception */ + .highlight .nf { color: #990000; font-weight: bold; } /* Name.Function */ + .highlight .nl { color: #990000; font-weight: bold; } /* Name.Label */ + .highlight .nn { color: #555555; } /* Name.Namespace */ + .highlight .nt { color: #000080; } /* Name.Tag */ + .highlight .nv { color: #008080; } /* Name.Variable */ + .highlight .ow { color: #000000; font-weight: bold; } /* Operator.Word */ + .highlight .w { color: #bbbbbb; } /* Text.Whitespace */ + .highlight .mf { color: #009999; } /* Literal.Number.Float */ + .highlight .mh { color: #009999; } /* Literal.Number.Hex */ + .highlight .mi { color: #009999; } /* Literal.Number.Integer */ + .highlight .mo { color: #009999; } /* Literal.Number.Oct */ + .highlight .sb { color: #d01040; } /* Literal.String.Backtick */ + .highlight .sc { color: #d01040; } /* Literal.String.Char */ + .highlight .sd { color: #d01040; } /* Literal.String.Doc */ + .highlight .s2 { color: #d01040; } /* Literal.String.Double */ + .highlight .se { color: #d01040; } /* Literal.String.Escape */ + .highlight .sh { color: #d01040; } /* Literal.String.Heredoc */ + .highlight .si { color: 
#d01040; } /* Literal.String.Interpol */ + .highlight .sx { color: #d01040; } /* Literal.String.Other */ + .highlight .sr { color: #009926; } /* Literal.String.Regex */ + .highlight .s1 { color: #d01040; } /* Literal.String.Single */ + .highlight .ss { color: #990073; } /* Literal.String.Symbol */ + .highlight .bp { color: #999999; } /* Name.Builtin.Pseudo */ + .highlight .vc { color: #008080; } /* Name.Variable.Class */ + .highlight .vg { color: #008080; } /* Name.Variable.Global */ + .highlight .vi { color: #008080; } /* Name.Variable.Instance */ + .highlight .il { color: #009999; } /* Literal.Number.Integer.Long */ + + /* --- custom light colors --- */ + --language-border-color: rgba(172, 169, 169, 0.2); + --highlight-bg-color: #f7f7f7; + --highlighter-rouge-color: #3f596f; + --highlight-lineno-color: #c2c6cc; + --inline-code-bg: #f6f6f7; + --code-header-text-color: #a3a3b1; + --code-header-muted-color: #ebebeb; + --code-header-icon-color: #d1d1d1; + --clipboard-checked-color: #43c743; + + [class^='prompt-'] { + --inline-code-bg: #fbfafa; + } +} /* light-syntax */ diff --git a/_sass/colors/light-typography.scss b/_sass/colors/light-typography.scss new file mode 100644 index 0000000..9fc8162 --- /dev/null +++ b/_sass/colors/light-typography.scss @@ -0,0 +1,109 @@ +/* + * The syntax light mode typography colors + */ + +@mixin light-scheme { + /* Framework color */ + --main-bg: white; + --mask-bg: #c1c3c5; + --main-border-color: #f3f3f3; + + /* Common color */ + --text-color: #34343c; + --text-muted-color: #8e8e8e; + --heading-color: black; + --blockquote-border-color: #eeeeee; + --blockquote-text-color: #9a9a9a; + --link-color: #0153ab; + --link-underline-color: #dee2e6; + --button-bg: #ffffff; + --btn-border-color: #e9ecef; + --btn-backtotop-color: #686868; + --btn-backtotop-border-color: #f1f1f1; + --btn-box-shadow: #eaeaea; + --checkbox-color: #c5c5c5; + --checkbox-checked-color: #07a8f7; + --img-bg: radial-gradient( + circle, + rgb(255, 255, 255) 0%, + rgb(239, 239, 239) 100% + ); + --shimmer-bg: linear-gradient( + 90deg, + rgba(250, 250, 250, 0) 0%, + rgba(232, 230, 230, 1) 50%, + rgba(250, 250, 250, 0) 100% + ); + + /* Sidebar */ + --sidebar-bg: #f6f8fa; + --sidebar-muted-color: #a2a19f; + --sidebar-active-color: #1d1d1d; + --sidebar-hover-bg: rgb(223, 233, 241, 0.64); + --sidebar-btn-bg: white; + --sidebar-btn-color: #8e8e8e; + --avatar-border-color: white; + + /* Topbar */ + --topbar-bg: rgb(255, 255, 255, 0.7); + --topbar-text-color: rgb(78, 78, 78); + --search-wrapper-border-color: rgb(240, 240, 240); + --search-tag-bg: #f8f9fa; + --search-icon-color: #c2c6cc; + --input-focus-border-color: #b8b8b8; + + /* Home page */ + --post-list-text-color: dimgray; + --btn-patinator-text-color: #555555; + --btn-paginator-hover-color: var(--sidebar-bg); + --btn-paginator-border-color: var(--sidebar-bg); + --btn-text-color: #676666; + + /* Posts */ + --toc-highlight: #563d7c; + --btn-share-hover-color: var(--link-color); + --card-bg: white; + --card-hovor-bg: #e2e2e2; + --card-shadow: rgb(104, 104, 104, 0.05) 0 2px 6px 0, + rgba(211, 209, 209, 0.15) 0 0 0 1px; + --label-color: #616161; + --relate-post-date: rgba(30, 55, 70, 0.4); + --footnote-target-bg: lightcyan; + --tag-bg: rgba(0, 0, 0, 0.075); + --tag-border: #dee2e6; + --tag-shadow: var(--btn-border-color); + --tag-hover: rgb(222, 226, 230); + --tb-odd-bg: #fbfcfd; + --tb-border-color: #eaeaea; + --dash-color: silver; + --kbd-wrap-color: #bdbdbd; + --kbd-text-color: var(--text-color); + --kbd-bg-color: white; + --prompt-text-color: 
rgb(46, 46, 46, 0.77); + --prompt-tip-bg: rgb(123, 247, 144, 0.2); + --prompt-tip-icon-color: #03b303; + --prompt-info-bg: #e1f5fe; + --prompt-info-icon-color: #0070cb; + --prompt-warning-bg: rgb(255, 243, 205); + --prompt-warning-icon-color: #ef9c03; + --prompt-danger-bg: rgb(248, 215, 218, 0.56); + --prompt-danger-icon-color: #df3c30; + + [class^='prompt-'] { + --link-underline-color: rgb(219, 216, 216); + } + + .dark { + display: none; + } + + /* Categories */ + --categories-border: rgba(0, 0, 0, 0.125); + --categories-hover-bg: var(--btn-border-color); + --categories-icon-hover-color: darkslategray; + + /* Archive */ + --timeline-color: rgba(0, 0, 0, 0.075); + --timeline-node-bg: #c2c6cc; + --timeline-year-dot-color: #ffffff; +} /* light-scheme */ diff --git a/_sass/jekyll-theme-chirpy.scss b/_sass/jekyll-theme-chirpy.scss new file mode 100644 index 0000000..83cf7e3 --- /dev/null +++ b/_sass/jekyll-theme-chirpy.scss @@ -0,0 +1,19 @@ +/*! + * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy) + * © 2019 Cotes Chung + * MIT Licensed + */ + +@import 'colors/light-typography'; +@import 'colors/dark-typography'; +@import 'addon/variables'; +@import 'variables-hook'; +@import 'addon/module'; +@import 'addon/syntax'; +@import 'addon/commons'; +@import 'layout/home'; +@import 'layout/post'; +@import 'layout/tags'; +@import 'layout/archives'; +@import 'layout/categories'; +@import 'layout/category-tag'; diff --git a/_sass/layout/archives.scss b/_sass/layout/archives.scss new file mode 100644 index 0000000..3a2e86b --- /dev/null +++ b/_sass/layout/archives.scss @@ -0,0 +1,144 @@ +/* + Style for Archives +*/ + +#archives { + letter-spacing: 0.03rem; + + $timeline-width: 4px; + + %timeline { + content: ''; + width: $timeline-width; + position: relative; + float: left; + background-color: var(--timeline-color); + } + + .year { + height: 3.5rem; + font-size: 1.5rem; + position: relative; + left: 2px; + margin-left: -$timeline-width; + + &::before { + @extend %timeline; + + height: 72px; + left: 79px; + bottom: 16px; + } + + &:first-child::before { + @extend %timeline; + + height: 32px; + top: 24px; + } + + /* Year dot */ + &::after { + content: ''; + display: inline-block; + position: relative; + border-radius: 50%; + width: 12px; + height: 12px; + left: 21.5px; + border: 3px solid; + background-color: var(--timeline-year-dot-color); + border-color: var(--timeline-node-bg); + box-shadow: 0 0 2px 0 #c2c6cc; + z-index: 1; + } + } + + ul { + li { + font-size: 1.1rem; + line-height: 3rem; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + + &:nth-child(odd) { + background-color: var(--main-bg, #ffffff); + background-image: linear-gradient( + to left, + #ffffff, + #fbfbfb, + #fbfbfb, + #fbfbfb, + #ffffff + ); + } + + &::before { + @extend %timeline; + + top: 0; + left: 77px; + height: 3.1rem; + } + } + + &:last-child li:last-child::before { + height: 1.5rem; + } + } /* #archives ul */ + + .date { + white-space: nowrap; + display: inline-block; + position: relative; + right: 0.5rem; + + &.month { + width: 1.4rem; + text-align: center; + } + + &.day { + font-size: 85%; + font-family: Lato, sans-serif; + } + } + + a { + /* post title in Archvies */ + margin-left: 2.5rem; + position: relative; + top: 0.1rem; + + &:hover { + border-bottom: none; + } + + &::before { + /* the dot before post title */ + content: ''; + display: inline-block; + position: relative; + border-radius: 50%; + width: 8px; + height: 8px; + float: left; + top: 1.35rem; + left: 71px; + background-color: 
var(--timeline-node-bg); + box-shadow: 0 0 3px 0 #c2c6cc; + z-index: 1; + } + } +} /* #archives */ + +@media all and (max-width: 576px) { + #archives { + margin-top: -1rem; + + ul { + letter-spacing: 0; + } + } +} diff --git a/_sass/layout/categories.scss b/_sass/layout/categories.scss new file mode 100644 index 0000000..330d3d3 --- /dev/null +++ b/_sass/layout/categories.scss @@ -0,0 +1,83 @@ +/* + Style for Tab Categories +*/ + +%category-icon-color { + color: gray; +} + +.categories { + margin-bottom: 2rem; + border-color: var(--categories-border); + + &.card, + .list-group { + @extend %rounded; + } + + .card-header { + $radius: calc($base-radius - 1px); + + padding: 0.75rem; + border-radius: $radius; + border-bottom: 0; + + &.hide-border-bottom { + border-bottom-left-radius: 0; + border-bottom-right-radius: 0; + } + } + + i { + @extend %category-icon-color; + + font-size: 86%; /* fontawesome icons */ + } + + .list-group-item { + border-left: none; + border-right: none; + padding-left: 2rem; + + &:first-child { + border-top-left-radius: 0; + border-top-right-radius: 0; + } + + &:last-child { + border-bottom: 0; + } + } +} /* .categories */ + +.category-trigger { + width: 1.7rem; + height: 1.7rem; + border-radius: 50%; + text-align: center; + color: #6c757d !important; + + i { + position: relative; + height: 0.7rem; + width: 1rem; + transition: transform 300ms ease; + } + + &:hover { + i { + color: var(--categories-icon-hover-color); + } + } +} + +/* only works on desktop */ +@media (hover: hover) { + .category-trigger:hover { + background-color: var(--categories-hover-bg); + } +} + +.rotate { + transform: rotate(-90deg); +} diff --git a/_sass/layout/category-tag.scss b/_sass/layout/category-tag.scss new file mode 100644 index 0000000..3b25db5 --- /dev/null +++ b/_sass/layout/category-tag.scss @@ -0,0 +1,77 @@ +/* + Style for page Category and Tag +*/ + +.dash { + margin: 0 0.5rem 0.6rem 0.5rem; + border-bottom: 2px dotted var(--dash-color); +} + +#page-category, +#page-tag { + ul > li { + line-height: 1.5rem; + padding: 0.6rem 0; + + /* dot */ + &::before { + background: #999999; + width: 5px; + height: 5px; + border-radius: 50%; + display: block; + content: ''; + position: relative; + top: 0.6rem; + margin-right: 0.5rem; + } + + /* post's title */ + > a { + @extend %no-bottom-border; + + font-size: 1.1rem; + } + + /* post's date */ + > span:last-child { + white-space: nowrap; + } + } +} + +/* tag icon */ +#page-tag h1 > i { + font-size: 1.2rem; +} + +#page-category h1 > i { + font-size: 1.25rem; +} + +#page-category, +#page-tag, +#access-lastmod { + a:hover { + @extend %link-hover; + + margin-bottom: -1px; /* Avoid jumping */ + } +} + +@media all and (max-width: 576px) { + #page-category, + #page-tag { + ul > li { + &::before { + margin: 0 0.5rem; + } + + > a { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + } + } + } +} diff --git a/_sass/layout/home.scss b/_sass/layout/home.scss new file mode 100644 index 0000000..499de47 --- /dev/null +++ b/_sass/layout/home.scss @@ -0,0 +1,219 @@ +/* + Style for Homepage +*/ + +#post-list { + margin-top: 2rem; + + a.card-wrapper { + display: block; + + &:hover { + text-decoration: none; + } + + &:not(:last-child) { + margin-bottom: 1.25rem; + } + } + + .card { + %img-radius { + border-radius: $base-radius $base-radius 0 0; + } + + .preview-img { + height: 10rem; + + @extend %img-radius; + + img { + width: 100%; + height: 100%; + -o-object-fit: cover; + object-fit: cover; + + @extend %img-radius; + } + } + + .card-body { + 
min-height: 10.5rem; + padding: 1rem; + + .card-title { + @extend %text-clip; + + font-size: 1.25rem; + } + + %muted { + color: var(--text-muted-color) !important; + } + + .card-text.post-content { + @extend %muted; + + p { + @extend %text-clip; + + line-height: 1.5; + margin: 0; + } + } + + .post-meta { + @extend %muted; + + i { + &:not(:first-child) { + margin-left: 1.5rem; + } + } + + em { + @extend %normal-font-style; + + color: inherit; + } + + > div:first-child { + display: block; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + } + } + } + } +} /* #post-list */ + +.pagination { + color: var(--btn-patinator-text-color); + font-family: Lato, sans-serif; + + a:hover { + text-decoration: none; + } + + .page-item { + .page-link { + color: inherit; + width: 2.5rem; + height: 2.5rem; + padding: 0; + display: -webkit-box; + -webkit-box-pack: center; + -webkit-box-align: center; + border-radius: 50%; + border: 1px solid var(--btn-paginator-border-color); + background-color: var(--button-bg); + + &:hover { + background-color: var(--btn-paginator-hover-color); + } + } + + &.active { + .page-link { + background-color: var(--btn-paginator-hover-color); + color: var(--btn-text-color); + } + } + + &.disabled { + cursor: not-allowed; + + .page-link { + color: rgba(108, 117, 125, 0.57); + border-color: var(--btn-paginator-border-color); + background-color: var(--button-bg); + } + } + + &:first-child .page-link, + &:last-child .page-link { + border-radius: 50%; + } + } /* .page-item */ +} /* .pagination */ + +/* Tablet */ +@media all and (min-width: 768px) { + #post-list { + %img-radius { + border-radius: 0 $base-radius $base-radius 0; + } + + .card { + .preview-img { + width: 20rem; + height: 11.55rem; // can hold 2 lines each for title and content + } + + .card-body { + min-height: 10.75rem; + width: 60%; + padding: 1.75rem 1.75rem 1.25rem 1.75rem; + + .card-text { + display: inherit !important; + } + + .post-meta { + i { + &:not(:first-child) { + margin-left: 1.75rem; + } + } + } + } + } + } +} + +/* Hide SideBar and TOC */ +@media all and (max-width: 830px) { + .pagination { + justify-content: space-evenly; + + .page-item { + &:not(:first-child):not(:last-child) { + display: none; + } + } + } +} + +/* Sidebar is visible */ +@media all and (min-width: 831px) { + #post-list { + margin-top: 2.5rem; + } + + .pagination { + font-size: 0.85rem; + + .page-item { + &:not(:last-child) { + margin-right: 0.7rem; + } + + .page-link { + width: 2rem; + height: 2rem; + } + } + + .page-index { + display: none; + } + } /* .pagination */ +} + +/* Panel is visible */ +@media all and (min-width: 1200px) { + #post-list { + padding-right: 0.5rem; + } +} diff --git a/_sass/layout/post.scss b/_sass/layout/post.scss new file mode 100644 index 0000000..3d01b4d --- /dev/null +++ b/_sass/layout/post.scss @@ -0,0 +1,417 @@ +/* + Post-specific style +*/ + +@mixin btn-sharing-color($light-color, $important: false) { + @if $important { + color: var(--btn-share-color, $light-color) !important; + } @else { + color: var(--btn-share-color, $light-color); + } +} + +%btn-post-nav { + width: 50%; + position: relative; + border-color: var(--btn-border-color); +} + +@mixin dot($pl: 0.25rem, $pr: 0.25rem) { + content: '\2022'; + padding-left: $pl; + padding-right: $pr; +} + +%text-color { + color: var(--text-color); +} + +.preview-img { + overflow: hidden; + aspect-ratio: 40 / 21; + + @extend %rounded; + + &:not(.no-bg) { + img.lazyloaded { + background: var(--img-bg); + } + } + + img { + -o-object-fit: cover; + 
object-fit: cover; + + @extend %rounded; + } +} + +h1 + .post-meta { + span + span::before { + @include dot; + } + + em { + @extend %text-color; + + a { + @extend %text-color; + } + } +} + +.post-tail-wrapper { + margin-top: 6rem; + border-bottom: 1px double var(--main-border-color); + font-size: 0.85rem; + + .post-tail-bottom a { + color: inherit; + } + + .license-wrapper { + line-height: 1.2rem; + + > a { + color: var(--text-color); + + &:hover { + @extend %link-hover; + } + } + + span:last-child { + font-size: 0.85rem; + } + } /* .license-wrapper */ + + .post-meta a:not(:hover) { + @extend %link-underline; + } + + .share-wrapper { + vertical-align: middle; + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; + + .share-icons { + font-size: 1.2rem; + + > i { + position: relative; + bottom: 1px; + + @extend %cursor-pointer; + + &:hover { + @extend %btn-share-hovor; + } + } + + a { + &:not(:last-child) { + margin-right: 0.25rem; + } + + &:hover { + text-decoration: none; + + > i { + @extend %btn-share-hovor; + } + } + } + + .fab { + &.fa-twitter { + @include btn-sharing-color(rgba(29, 161, 242, 1)); + } + + &.fa-facebook-square { + @include btn-sharing-color(rgb(66, 95, 156)); + } + + &.fa-telegram { + @include btn-sharing-color(rgb(39, 159, 217)); + } + + &.fa-linkedin { + @include btn-sharing-color(rgb(0, 119, 181)); + } + + &.fa-weibo { + @include btn-sharing-color(rgb(229, 20, 43)); + } + } + } /* .share-icons */ + + .fas.fa-link { + @include btn-sharing-color(rgb(171, 171, 171)); + } + } /* .share-wrapper */ +} + +.post-tags { + line-height: 2rem; + + .post-tag { + background: var(--tag-bg); + + &:hover { + @extend %link-hover; + @extend %tag-hover; + @extend %no-bottom-border; + } + } +} + +.post-navigation { + padding-top: 3rem; + padding-bottom: 4rem; + + .btn { + @extend %btn-post-nav; + + &:not(:hover) { + color: var(--link-color); + } + + &:hover { + &:not(.disabled)::before { + color: whitesmoke; + } + } + + &.disabled { + @extend %btn-post-nav; + + pointer-events: auto; + cursor: not-allowed; + background: none; + color: gray; + } + + &.btn-outline-primary.disabled:focus { + box-shadow: none; + } + + &::before { + color: var(--text-muted-color); + font-size: 0.65rem; + text-transform: uppercase; + content: attr(prompt); + } + + &:first-child { + border-radius: $base-radius 0 0 $base-radius; + left: 0.5px; + } + + &:last-child { + border-radius: 0 $base-radius $base-radius 0; + right: 0.5px; + } + } + + p { + font-size: 1.1rem; + line-height: 1.5rem; + margin-top: 0.3rem; + white-space: normal; + } +} /* .post-navigation */ + +@media (hover: hover) { + .post-navigation { + .btn, + .btn::before { + transition: all 0.35s ease-in-out; + } + } +} + +@-webkit-keyframes fade-up { + from { + opacity: 0; + position: relative; + top: 2rem; + } + + to { + opacity: 1; + position: relative; + top: 0; + } +} + +@keyframes fade-up { + from { + opacity: 0; + position: relative; + top: 2rem; + } + + to { + opacity: 1; + position: relative; + top: 0; + } +} + +#toc-wrapper { + border-left: 1px solid rgba(158, 158, 158, 0.17); + position: -webkit-sticky; + position: sticky; + top: 4rem; + transition: top 0.2s ease-in-out; + -webkit-animation: fade-up 0.8s; + animation: fade-up 0.8s; + + ul { + list-style: none; + font-size: 0.85rem; + line-height: 1.25; + padding-left: 0; + + li { + &:not(:last-child) { + margin: 0.4rem 0; + } + + a { + padding: 0.2rem 0 0.2rem 1.25rem; + } + } + + /* Overwrite TOC plugin style */ + + .toc-link { + display: block; 
+ white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + + &:hover { + color: var(--toc-highlight); + text-decoration: none; + } + + &::before { + display: none; + } + } + + .is-active-link { + color: var(--toc-highlight) !important; + font-weight: 600; + + &::before { + display: inline-block; + width: 1px; + left: -1px; + height: 1.25rem; + background-color: var(--toc-highlight) !important; + } + } + + ul { + a { + padding-left: 2rem; + } + } + } +} + +/* --- Related Posts --- */ + +#related-posts { + > h3 { + @include label(1.1rem, 600); + } + + em { + @extend %normal-font-style; + + color: var(--relate-post-date); + } + + p { + font-size: 0.9rem; + margin-bottom: 0.5rem; + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; + } + + .card { + h4 { + @extend %text-color; + @extend %text-clip; + } + } +} + +#tail-wrapper { + min-height: 2rem; + + > div:last-of-type { + margin-bottom: 2rem; + } + + /* stylelint-disable-next-line selector-id-pattern */ + #disqus_thread { + min-height: 8.5rem; + } +} + +%btn-share-hovor { + color: var(--btn-share-hover-color) !important; +} + +.share-label { + @include label(inherit, 400, inherit); + + &::after { + content: ':'; + } +} + +@media all and (max-width: 576px) { + .preview-img[data-src] { + margin-top: 2.2rem; + } + + .post-tail-bottom { + flex-wrap: wrap-reverse !important; + + > div:first-child { + width: 100%; + margin-top: 1rem; + } + } +} + +@media all and (max-width: 768px) { + .post-content > p > img { + max-width: calc(100% + 1rem); + } +} + +/* Hide SideBar and TOC */ +@media all and (max-width: 849px) { + .post-navigation { + padding-left: 0; + padding-right: 0; + margin-left: -0.5rem; + margin-right: -0.5rem; + } + + .preview-img[data-src] { + max-width: 100vw; + border-radius: 0; + } +} diff --git a/_sass/layout/tags.scss b/_sass/layout/tags.scss new file mode 100644 index 0000000..4cf5d3b --- /dev/null +++ b/_sass/layout/tags.scss @@ -0,0 +1,19 @@ +/* + Styles for Tab Tags +*/ + +.tag { + border-radius: 0.7em; + padding: 6px 8px 7px; + margin-right: 0.8rem; + line-height: 3rem; + letter-spacing: 0; + border: 1px solid var(--tag-border) !important; + box-shadow: 0 0 3px 0 var(--tag-shadow); + + span { + margin-left: 0.6em; + font-size: 0.7em; + font-family: Oswald, sans-serif; + } +} diff --git a/_sass/variables-hook.scss b/_sass/variables-hook.scss new file mode 100644 index 0000000..f27e0eb --- /dev/null +++ b/_sass/variables-hook.scss @@ -0,0 +1,3 @@ +/* + Appending custom SCSS variables will override the default ones in `_sass/addon/variables.scsss` +*/ diff --git a/_tabs/projects.md b/_tabs/projects.md new file mode 100644 index 0000000..826d49b --- /dev/null +++ b/_tabs/projects.md @@ -0,0 +1,5 @@ +--- +# the default layout is 'page' +icon: fas fa-diagram-project +order: 3 +--- diff --git a/_tabs/resume.md b/_tabs/resume.md new file mode 100644 index 0000000..5f94c5c --- /dev/null +++ b/_tabs/resume.md @@ -0,0 +1,11 @@ +--- +# the default layout is 'page' +icon: fas fa-file +order: 2 +--- + +
      diff --git a/assets/404.html b/assets/404.html new file mode 100644 index 0000000..0337bac --- /dev/null +++ b/assets/404.html @@ -0,0 +1,16 @@ +--- +layout: page +title: "404: Page not found" +permalink: /404.html + +redirect_from: + - /norobots/ + - /assets/ + - /posts/ +--- + +{% include lang.html %} + +
{{ site.data.locales[lang].not_found.statment }}
      diff --git a/assets/AWS Certified Solutions Architect - Associate.pdf b/assets/AWS Certified Solutions Architect - Associate.pdf new file mode 100644 index 0000000..9404238 Binary files /dev/null and b/assets/AWS Certified Solutions Architect - Associate.pdf differ diff --git a/assets/CKA: Certified Kubernetes Administrator.pdf b/assets/CKA: Certified Kubernetes Administrator.pdf new file mode 100644 index 0000000..285a6ec Binary files /dev/null and b/assets/CKA: Certified Kubernetes Administrator.pdf differ diff --git a/assets/Shameek-Agarwal-Resume.pdf b/assets/Shameek-Agarwal-Resume.pdf new file mode 100644 index 0000000..0ff9a34 Binary files /dev/null and b/assets/Shameek-Agarwal-Resume.pdf differ diff --git a/assets/css/style.scss b/assets/css/style.scss new file mode 100644 index 0000000..a8805f4 --- /dev/null +++ b/assets/css/style.scss @@ -0,0 +1,6 @@ +--- +--- + +@import '{{ site.theme }}'; + +/* append your custom style below */ diff --git a/assets/feed.xml b/assets/feed.xml new file mode 100644 index 0000000..a244a56 --- /dev/null +++ b/assets/feed.xml @@ -0,0 +1,61 @@ +--- +layout: compress +permalink: /feed.xml +# Atom Feed, reference: https://validator.w3.org/feed/docs/atom.html +--- + +{% capture source %} + + {{ "/" | absolute_url }} + {{ site.title }} + {{ site.description }} + {{ site.time | date_to_xmlschema }} + + {{ site.social.name }} + {{ "/" | absolute_url }} + + + + Jekyll + © {{ 'now' | date: '%Y' }} {{ site.social.name }} + {{ site.baseurl }}/assets/img/favicons/favicon.ico + {{ site.baseurl }}/assets/img/favicons/favicon-96x96.png + +{% for post in site.posts limit: 5 %} + {% assign post_absolute_url = post.url | absolute_url %} + + {{ post.title }} + + {{ post.date | date_to_xmlschema }} + {% if post.last_modified_at %} + {{ post.last_modified_at | date_to_xmlschema }} + {% else %} + {{ post.date | date_to_xmlschema }} + {% endif %} + {{ post_absolute_url }} + + + {{ post.author | default: site.social.name }} + + + {% if post.categories %} + {% for category in post.categories %} + + {% endfor %} + {% endif %} + + {% if post.summary %} + {{ post.summary | strip }} + {% else %} + + {% include no-linenos.html content=post.content %} + {{ content | strip_html | truncate: 400 }} + + {% endif %} + + +{% endfor %} + +{% endcapture %} +{{ source | replace: '&', '&' }} diff --git a/assets/img/docker-and-kubernetes/docker-vs-vm.drawio b/assets/img/docker-and-kubernetes/docker-vs-vm.drawio new file mode 100644 index 0000000..43b9d1e --- /dev/null +++ b/assets/img/docker-and-kubernetes/docker-vs-vm.drawio @@ -0,0 +1 @@ 
+5Zpdb5swFIZ/TS4ngd2Q5LJN03YXVbVF2rRLFxywZmxkTBP263dYDAmYqllUCg03kTk2OH5e+3yETPAy3t0rkkSPMqB8gpxgN8G3E4Tm3hw+C0O+N0xdb28IFQv2JvdgWLM/1BgdY81YQNPaQC0l1yypG30pBPV1zUaUktv6sI3k9VkTElLLsPYJt60/WaAjsyw0O9gfKAujcmbXW+x7YlIONitJIxLI7ZEJryZ4qaTU+1a8W1JesCu57O+7e6W3+mKKCn3KDcHT/TeZ5Rv9dDdDYR59Z+TXF/OUF8Izs+AJ8jg87yZNiIB2WLS/io0iqVaZrzNFyxEw1fEgs0qdl+iUzERAi9kd6N5GTNN1Qvyidwt7BWyRjjlcudXdL1Rpunt1gW6FDbYblTHVKoch5oYrA9rsNDQ319uDbq5nbNGRZuV9xGyVsHrygSY0DND/gOudBPcpoYpoJkIYus5TTePPAhj1DXhuAX7IgeYLS6UaGq2KTG+0Spd6EYe9Ot2DOe0uasE7NEq9H1kXW5TaHGCDGxDQdTiwQeVvupQcTjq+FVLAyJsN47xhIpyFAi59IEbBflPwZBDfr01HzIKgmKZVjbpe7yHIoinI1BbkqkUQ3JkgVyd5haUUmjABCJGzEiG0PotnGIDjbUsEeqY0rUNq8Z5tiLzOEM0sRCRJLtcPnIcfdYbfzqSeGRxpeJrD2fNodHCdvoVYWEKEGU01mGQ6Hhla0pQPlQHZdfGPx6H57Ja49rGQBpjwunhYcQ3Z6e5Fx7Uz+Xe3R+3sdhyBrSlE75GtLHRGFtksHXoPbXYxMoDQ1sTUe2yz65HeGTV/Ozj1SHdW1yK7arjo4HauAN3tUrtaGEd0s5To26tiu2A4+sVsYI6j76QMD7BwwLOBOVc8ssrhXAG626VjLR0sJXp3rnbKOhzn2qTVu3O1M9civ3eulV+wMC91L3XvNsTA2Fajciwf8noN2zny8au0caiCFtO3ZcHvIwtcHv5T9a/v6I9pePUX \ No newline at end of file diff --git a/assets/img/docker-and-kubernetes/docker-vs-vm.drawio.png b/assets/img/docker-and-kubernetes/docker-vs-vm.drawio.png new file mode 100644 index 0000000..5bbd5a3 Binary files /dev/null and b/assets/img/docker-and-kubernetes/docker-vs-vm.drawio.png differ diff --git a/assets/img/docker-and-kubernetes/networks.drawio b/assets/img/docker-and-kubernetes/networks.drawio new file mode 100644 index 0000000..2bff8ce --- /dev/null +++ b/assets/img/docker-and-kubernetes/networks.drawio @@ -0,0 +1 @@ +3ZbdjpswEIWfhsuusElIctnQbFeqKm0VVb12sQNWjI2MKaFP33EwEC9s1UpNhVaRIvvM+O87AziIkuLyUZMy/6woEwEO6SWIPgQYb+Mt/Fuh7YQ1ijsh05x2EhqFI//JnBg6teaUVV6iUUoYXvpiqqRkqfE0orVq/LSTEv6qJcnYRDimREzVb5ya3B0Lb0b9ifEs71dG8a6LFKRPdiepckJVcyNFhyBKtFKmaxWXhAnLrufSjXt8JTpsTDNp/mQATb4envnzU9a0fPdllXw/40/v3Cw/iKjdgd1mTdsT0KqWlNlJwiDaNzk37FiS1EYbsBy03BQCegiabjqmDbu8uk80nB6qhqmCGd1Cihuwcrxav9uM9FGPNL8lv3EicY5nw8wjFGg4Ln/BCE8YofABwQ8WfMSrCTCYEqoTOnuwvLRiKlRN/wu8tQ9vPQMvnIF3N3bR8uoLh0srsNV8geElFtjwYl5Kha2n8Hb4AcVb+4iiCTs4ufGhVEarM0uUUBoUqaQFe+JCvJCI4Jm0rIEUA31vOXL4Urx3gYJTapeZdcGv8n9gxMb3YfDlxoh4xofoXj7Ev/Nh+tS/GR8GoksxYjMxwuTMDrK4JLHjJTON0mcrVterCrUJ6uSFJClYZeFBSmIn5NrU1+FVw02adyoz6dv19sXXFEX3e9lBd7wKXmM39+no8As= \ No newline at end of file diff --git a/assets/img/docker-and-kubernetes/networks.drawio.png b/assets/img/docker-and-kubernetes/networks.drawio.png new file mode 100644 index 0000000..ee70b6a Binary files /dev/null and b/assets/img/docker-and-kubernetes/networks.drawio.png differ diff --git a/assets/img/docker-and-kubernetes/pod-creation.drawio b/assets/img/docker-and-kubernetes/pod-creation.drawio new file mode 100644 index 0000000..bd4042d --- /dev/null +++ b/assets/img/docker-and-kubernetes/pod-creation.drawio @@ -0,0 +1 @@ +mxfile host="Electron" modified="2023-06-30T11:18:18.069Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/19.0.3 Chrome/102.0.5005.63 Electron/19.0.3 Safari/537.36" etag="IFS0dK-y9SUW6tHiZXQp" version="19.0.3" 
type="device">5ZrNctowEICfhmM72MIQjglJfw7tpENn2h5Va4M1CMuR5QB9+kpYNlgyCSE4ZsyFQWt5Je/ut7sy9NBksfoscBJ94wRYz++TVQ/d9nz/anilPrVgnQsCb5gLZoKSXORtBVP6D4ywb6QZJZBWJkrOmaRJVRjyOIZQVmRYCL6sTnvgrLpqgmfgCKYhZq70FyUyMo/lj7byL0BnUbGyNxznVxa4mGyeJI0w4csdEbrroYngXObfFqsJMG27wi75fZ/2XC03JiCWh9zwdbS8i39+J17qzeePD9Pk/sf0g9HyhFlmHnie/VV2ZGbPcl0YQvAsJqB19XvoZhlRCdMEh/rqUnleySK5YGrkqa9GKwgJq73b9UojqOABvgAp1mqKuWFgzLauDpc7TjCiaMf+xTRs3D4r9W4to74Y47zCUL5jKMdCEJNrHXFqFPMYqhapmk/ZQKx/68HHoBj+MTM3g9vV7szbtRnlawJxYtayqoIEixnIlzzvWn/HvkGNfQuZAIYlfapuo87oZoV7TtUGS+deWc613ZbyTIRgbtqNaUuPF1QVeSNLUW4GR9EmBMqnPj4qkBMVOKE6KkCo4G+dIW94XhANugYRahMiv38iivxRuxQFTlikYQQkY2eAkJOqWkZo2DWEgjYRQv6JEELjdhEaOWGh+zgGsnWABnaWahmgq64BNGoToIHdph8LUOC1C9DYCQvCw/kZFKDATlEt81P4qTsAjdsEKLBb9GMBGqJ2AfLcVwnPxUXIcJrS8LnQONTFZ3KI9dCRnrP7eEdR057zHc8JeMwg1XuUXH2EArAEvSNtf04cx6r0JquuTKXgc5hwxsU2CzxQxiwRZnQW63hQPlXZFt3oZElDzK7NhQUlRC9Tm2rrAuZtJ2bLF+PDsq3tsdNl29cdmTtAld2SO3nsaKzeOyG6R7UllmHU02+hVV/Rz2LtrFmsXLOhKu0uVr7d3SGXqxK99wHLPTFdGFj+8ERgOYqaBss9wuUg6XIVQVGi8tKlK1bMCWzMLDV8mKjilpezMMm0INazF7DgJq4uA8Ga0laLoO3d0yHonrk6jqB9Wi6RfCuCjqKGESyW21vb8nLW3ylwGxqpTEseuwoaGuxJjy/0kEFTnBVYXS5n5TvIt3LmKGqaM/cHxiwhefHKi1wqscw63DnaNKEDaWqscfQv7kTmVK1jG0f7XZejqGmaan5oTKhuG7Ok7B1DHktM45o3v51hamD7wW+MKTXc/sMp9+P2b2Lo7j8= \ No newline at end of file diff --git a/assets/img/docker-and-kubernetes/pod-creation.drawio.png b/assets/img/docker-and-kubernetes/pod-creation.drawio.png new file mode 100644 index 0000000..80692bd Binary files /dev/null and b/assets/img/docker-and-kubernetes/pod-creation.drawio.png differ diff --git a/assets/img/docker-and-kubernetes/replica-set-creation.drawio b/assets/img/docker-and-kubernetes/replica-set-creation.drawio new file mode 100644 index 0000000..e5c4fa4 --- /dev/null +++ b/assets/img/docker-and-kubernetes/replica-set-creation.drawio @@ -0,0 +1 @@ +5VhNU9swEP01PjYT2SQkR0iA9tCZDsy05SisTSyQrVRWcNJfX8mWPySZFlJCmHDJaFfyavXe7mqjIJqlmyuBV8lXToAF4ZBsgmgehOFkPFG/WrGtFCM0rhRLQUmlQq3ihv4Goxwa7ZoSyK2FknMm6cpWxjzLIJaWDgvBC3vZgjN71xVegqe4iTHztT8okYk5Vnja6j8DXSb1zmg8rWZSXC82J8kTTHjRUUUXQTQTnMtqlG5mwDR2NS7Vd5dPzDaOCcjkcz4oRleT++js+vvtff5lO59eTxZ3n4yVR8zW5sAP6zuFIzM+y20NhODrjIC2NQyi8yKhEm5WONazhWJe6RKZMiUhNTRWQUjYPOkuakBQwQM8BSm2aon54MTAtrXFokOCUSUd/Otl2NC+bOy2yKiBAecFQIUeUHhFNUIg1DEPjhYavy+4Ig8uAStGY1xCJquElYIz9g7Aa6B5J+CdeOB5EEFGznR1U1LGM7AhsfGDDZU/9XgwMtKtWafH801n2XxbCxm5pNrpUqp2B+JVSgdg5SFfixj+VW98IjpIj3qQrnUCGJb00XajD36zwzdOM9nhObR5bi6Z2kTlvvmqW0odQ068NHJtR2KxBOnZKYOhOfXu8TE63vgIDxkfU5vW6Y7REQ4PGx7j4w2P6KDlA71OfERuGXrj+Dh9UXzEDOc5jf8WIs8l+FDEOeUauRf3rontGdozcROPOAG/1pDL8k+R7qkEYAnaoRI0q+FyGFbdkrQ5zVU79gAzzrhoy8JCpbGjwowuMx0YilzVvEXnuvdSG7EzM5FSQvQ2vZ1bX+T8V/PmFG009Ju3RteNrWhf3dv0o6WXW8/Qa12cnqE951dd3zvUFVjGSaDfDYSbUvnx5lSTHDURIz+pTt8yp5D/TvHBkiocv1JSeYb2nVT+y0lzTbX5lPGB+uULTQEnenh3D/FR55jLC+q5uMKeANvbswPqe7Qpm4uSF6pHNNe9RqLJy3EKlRs1ZyWvlGeDweCIeXMajkbu8Bb28bZDcVRi+z5cJWT7yB5d/AE= \ No newline at end of file diff --git a/assets/img/docker-and-kubernetes/replica-set-creation.drawio.png b/assets/img/docker-and-kubernetes/replica-set-creation.drawio.png new file mode 100644 index 0000000..e28bdae Binary files /dev/null and b/assets/img/docker-and-kubernetes/replica-set-creation.drawio.png differ diff --git a/assets/img/docker-and-kubernetes/service-creation.drawio b/assets/img/docker-and-kubernetes/service-creation.drawio new file mode 100644 index 0000000..25fd95b --- /dev/null +++ b/assets/img/docker-and-kubernetes/service-creation.drawio @@ -0,0 +1 @@ 
+5VrRbpswFP2aSNvDJsBAk8c2addNnTSpD1v35oAb3Do4M6ZJ+vUzwQTwJW3ahhAlL1F8AWPfc459LtBDw+nim8Cz6CcPCes5VrjooVHPcfp+X/1mgWUe8Gw/D0wEDfOQXQZu6TPRQUtHUxqSpHai5JxJOqsHAx7HJJC1GBaCz+un3XNWv+sMTwgI3AaYwehvGspIT8s5K+PXhE6i4s62P8iPTHFxsp5JEuGQzyshdNlDQ8G5zP9NF0PCstwVecmvu9pwdD0wQWK5zQX94O/3ufU7vXueW8jBN9c/xOiLq8cml8WESajmr5tcyIhPeIzZZRm9EDyNQ5L1aqlWec4N5zMVtFXwgUi51GDiVHIViuSU6aMkDs8zaFQz5jHJI1eUMd1lPqZsIBunqkMJT0VAXpifvU60IijhUyLFUl0nCMOSPtX7x5oqk/V560t/caru7FgFqzWkmtNu0S56kFhMiNQXlZCoP5VRlKEVUG8ATY/3CbNUz+AxHSvyMwBmHap5RCW5neFVxuZKrnVYdK9ESLJ4OfEwofoC10iMbs4rytGhqCIa19oMQC13b02Uf+TsPmuF3Y7VLb3PAL3xjGbpIEKRs3OO2/5hkbx/5CQftEJy5HRL8gEguUrgTI9RmRkpOGMHwHaw13XM9sIXHi3d7XYsi2tuzfv2LM2mJUuS4Itl5zx3zU2va547x85z1ArP/a55jjbxPIyTzlnuHxrLXZAtSPuSkwHDSUKDelrqOdyWoTBFlRx4DTkoYrstHW3PSG2uG0DPV0066KhtnnsAOUH+pSTJxqiWFWViBMEyIz7W3p0qXpvgKirLOpyJcj6PZMgZF+UqdK+WICOEGZ3EGScUrsonoYtMGDTA7FwfmNIwXK2KTbJqIs2HlDUw4LChsmyrgVaoNWn5pyYt09Lbg11py+yobW3BCniOZRD1sueX4hTUtH78WyDgdy6n/qnLydnVVgU6altOsNZe702VopuPH0ggoUc7WlE5ne9RDiwKj1xUZhnumOXJu0W15zqnQGrjHnVKykImqg11VaOyWiusHFiGflBZKhVi+afauMsaX72iOVpUD46Wb1SkqhpWjH1pUt4hSRfZ75SuO3ilo7alC0s3Ous5PsvEOFba9SfZP4nHDFrMfT+x8EwX0ofSchtA91tTFnQTOAxh+or1L4GHcBxmdEwZaTi4qp034nHEi6jrGUgjiLTfgLS58e0M6eL+p+NOzIenaFfuBHTU8hKHoLE8sQraM6H0trQkrZl9BB3jnuS0hbVA7iHpzjVB2FZ35m4JOmpbd9B2rkvt1fsPlamAi/CIdYcMBKwG3dl71R18x7KCwvQXxec1ZvxT8rhUF3wGmO39FS348GY7g/AOK6ia5YeZuTjKr1vR5X8= \ No newline at end of file diff --git a/assets/img/docker-and-kubernetes/service-creation.drawio.png b/assets/img/docker-and-kubernetes/service-creation.drawio.png new file mode 100644 index 0000000..b8818ff Binary files /dev/null and b/assets/img/docker-and-kubernetes/service-creation.drawio.png differ diff --git a/assets/img/elasticsearch/data-table.png b/assets/img/elasticsearch/data-table.png new file mode 100644 index 0000000..b25a877 Binary files /dev/null and b/assets/img/elasticsearch/data-table.png differ diff --git a/assets/img/elasticsearch/discover.drawio b/assets/img/elasticsearch/discover.drawio new file mode 100644 index 0000000..41380df --- /dev/null +++ b/assets/img/elasticsearch/discover.drawio @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/elasticsearch/discover.drawio.png b/assets/img/elasticsearch/discover.drawio.png new file mode 100644 index 0000000..795dd36 Binary files /dev/null and b/assets/img/elasticsearch/discover.drawio.png differ diff --git a/assets/img/elasticsearch/filters.png b/assets/img/elasticsearch/filters.png new file mode 100644 index 0000000..922e6b1 Binary files /dev/null and b/assets/img/elasticsearch/filters.png differ diff --git a/assets/img/elasticsearch/heat-map.png b/assets/img/elasticsearch/heat-map.png new file mode 100644 index 0000000..3d941cb Binary files /dev/null and b/assets/img/elasticsearch/heat-map.png differ diff --git a/assets/img/elasticsearch/interactivity.png b/assets/img/elasticsearch/interactivity.png new file mode 100644 index 0000000..d97efe9 Binary files /dev/null and b/assets/img/elasticsearch/interactivity.png differ diff --git a/assets/img/elasticsearch/line-chart.png b/assets/img/elasticsearch/line-chart.png new file mode 100644 index 0000000..ab3d294 Binary files /dev/null and b/assets/img/elasticsearch/line-chart.png differ diff --git a/assets/img/elasticsearch/metrics.png b/assets/img/elasticsearch/metrics.png new file mode 100644 index 0000000..2f6e0e5 Binary files /dev/null and b/assets/img/elasticsearch/metrics.png differ diff --git a/assets/img/elasticsearch/tag-clouds.png 
b/assets/img/elasticsearch/tag-clouds.png new file mode 100644 index 0000000..4ebac0c Binary files /dev/null and b/assets/img/elasticsearch/tag-clouds.png differ diff --git a/assets/img/favicons/android-chrome-192x192.png b/assets/img/favicons/android-chrome-192x192.png new file mode 100644 index 0000000..a949d2f Binary files /dev/null and b/assets/img/favicons/android-chrome-192x192.png differ diff --git a/assets/img/favicons/android-chrome-512x512.png b/assets/img/favicons/android-chrome-512x512.png new file mode 100644 index 0000000..a0cdd95 Binary files /dev/null and b/assets/img/favicons/android-chrome-512x512.png differ diff --git a/assets/img/favicons/apple-touch-icon.png b/assets/img/favicons/apple-touch-icon.png new file mode 100644 index 0000000..648097f Binary files /dev/null and b/assets/img/favicons/apple-touch-icon.png differ diff --git a/assets/img/favicons/browserconfig.xml b/assets/img/favicons/browserconfig.xml new file mode 100644 index 0000000..a02a5c7 --- /dev/null +++ b/assets/img/favicons/browserconfig.xml @@ -0,0 +1,13 @@ +--- +layout: compress +--- + + + + + + + #da532c + + + diff --git a/assets/img/favicons/favicon-16x16.png b/assets/img/favicons/favicon-16x16.png new file mode 100644 index 0000000..f44237a Binary files /dev/null and b/assets/img/favicons/favicon-16x16.png differ diff --git a/assets/img/favicons/favicon-32x32.png b/assets/img/favicons/favicon-32x32.png new file mode 100644 index 0000000..d5d021d Binary files /dev/null and b/assets/img/favicons/favicon-32x32.png differ diff --git a/assets/img/favicons/favicon.ico b/assets/img/favicons/favicon.ico new file mode 100644 index 0000000..5611568 Binary files /dev/null and b/assets/img/favicons/favicon.ico differ diff --git a/assets/img/favicons/mstile-150x150.png b/assets/img/favicons/mstile-150x150.png new file mode 100644 index 0000000..c0d045e Binary files /dev/null and b/assets/img/favicons/mstile-150x150.png differ diff --git a/assets/img/favicons/site.webmanifest b/assets/img/favicons/site.webmanifest new file mode 100644 index 0000000..03c6113 --- /dev/null +++ b/assets/img/favicons/site.webmanifest @@ -0,0 +1,26 @@ +--- +layout: compress +--- + +{% assign favicon_path = "/assets/img/favicons" | relative_url %} + +{ + "name": "{{ site.title }}", + "short_name": "{{ site.title }}", + "description": "{{ site.description }}", + "icons": [ + { + "src": "{{ favicon_path }}/android-chrome-192x192.png", + "sizes": "192x192", + "type": "image/png" + }, + { + "src": "{{ favicon_path }}/android-chrome-512x512.png", + "sizes": "512x512", + "type": "image/png" + }], + "start_url": "{{ '/index.html' | relative_url }}", + "theme_color": "#2a1e6b", + "background_color": "#ffffff", + "display": "fullscreen" +} diff --git a/assets/img/hadoop/hadoop1.x.drawio b/assets/img/hadoop/hadoop1.x.drawio new file mode 100644 index 0000000..37348e0 --- /dev/null +++ b/assets/img/hadoop/hadoop1.x.drawio @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/hadoop/hadoop1.x.drawio.png b/assets/img/hadoop/hadoop1.x.drawio.png new file mode 100644 index 0000000..a6e8323 Binary files /dev/null and b/assets/img/hadoop/hadoop1.x.drawio.png differ diff --git a/assets/img/hadoop/hadoop2.x.drawio b/assets/img/hadoop/hadoop2.x.drawio new file mode 100644 index 0000000..7dc200b --- /dev/null +++ b/assets/img/hadoop/hadoop2.x.drawio @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/hadoop/hadoop2.x.drawio.png b/assets/img/hadoop/hadoop2.x.drawio.png new file mode 100644 index 0000000..3cc44da Binary files /dev/null and b/assets/img/hadoop/hadoop2.x.drawio.png differ diff --git a/assets/img/hadoop/hdfs.drawio b/assets/img/hadoop/hdfs.drawio new file mode 100644 index 0000000..b104b4d --- /dev/null +++ b/assets/img/hadoop/hdfs.drawio @@ -0,0 +1,86 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/hadoop/hdfs.drawio.png b/assets/img/hadoop/hdfs.drawio.png new file mode 100644 index 0000000..a01b444 Binary files /dev/null and b/assets/img/hadoop/hdfs.drawio.png differ diff --git a/assets/img/high-level-design/2-phase-locking.png b/assets/img/high-level-design/2-phase-locking.png new file mode 100644 index 0000000..dd5f630 Binary files /dev/null and b/assets/img/high-level-design/2-phase-locking.png differ diff --git a/assets/img/high-level-design/anti-corruption-adapter-layer-pattern.png b/assets/img/high-level-design/anti-corruption-adapter-layer-pattern.png new file mode 100644 index 0000000..a58fd08 Binary files /dev/null and b/assets/img/high-level-design/anti-corruption-adapter-layer-pattern.png differ diff --git a/assets/img/high-level-design/b+-tree.png b/assets/img/high-level-design/b+-tree.png new file mode 100644 index 0000000..3db915c Binary files /dev/null and b/assets/img/high-level-design/b+-tree.png differ diff --git a/assets/img/high-level-design/backends-for-frontends-pattern.png b/assets/img/high-level-design/backends-for-frontends-pattern.png new file mode 100644 index 0000000..9cdf2cc Binary files /dev/null and b/assets/img/high-level-design/backends-for-frontends-pattern.png differ diff --git a/assets/img/high-level-design/blue-green-deployment-pattern.png b/assets/img/high-level-design/blue-green-deployment-pattern.png new file mode 100644 index 0000000..6b7df6c Binary files /dev/null and b/assets/img/high-level-design/blue-green-deployment-pattern.png differ diff --git a/assets/img/high-level-design/canary-testing.png b/assets/img/high-level-design/canary-testing.png new file mode 100644 index 0000000..4b80cae Binary files /dev/null and b/assets/img/high-level-design/canary-testing.png differ diff --git a/assets/img/high-level-design/cap-theorem-introduction.excalidraw b/assets/img/high-level-design/cap-theorem-introduction.excalidraw new file mode 100644 index 0000000..495853c --- /dev/null +++ b/assets/img/high-level-design/cap-theorem-introduction.excalidraw @@ -0,0 +1,1043 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "cZTSQwT7h9aEkKp0RZYUC", + "type": "ellipse", + "x": 470, + "y": 138, + "width": 61, + "height": 64, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1321756799, + "version": 57, + "versionNonce": 1991398737, + "isDeleted": false, + "boundElements": [ + { + "id": "_B02NJtgoNBJYYYLXVhP-", + "type": "arrow" + }, + { + "type": "text", + "id": "jp8VWtYlR34L9aHbxlY4k" + }, + { + "id": "XBVA2zJQJ21mjmvFhum6O", + "type": "arrow" + }, + { + "id": "HfyHktbDks70F3aR6q8_7", + "type": "arrow" + } + ], + "updated": 
1710153362633, + "link": null, + "locked": false + }, + { + "id": "jp8VWtYlR34L9aHbxlY4k", + "type": "text", + "x": 494.5738681738103, + "y": 157.87258300203047, + "width": 11.71875, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1694838033, + "version": 4, + "versionNonce": 1802625649, + "isDeleted": false, + "boundElements": null, + "updated": 1710153277350, + "link": null, + "locked": false, + "text": "6", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "cZTSQwT7h9aEkKp0RZYUC", + "originalText": "6", + "lineHeight": 1.2 + }, + { + "type": "ellipse", + "version": 141, + "versionNonce": 701269617, + "isDeleted": false, + "id": "TxBVq2G_GfkJgm8jBI7sm", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 638.5, + "y": 162, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 61, + "height": 64, + "seed": 1603753681, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "hCiqyN7QmOnn0eQKJ4K3s", + "type": "arrow" + }, + { + "type": "text", + "id": "tDLgiNFO90WiS9jCe7Ks5" + }, + { + "id": "XBVA2zJQJ21mjmvFhum6O", + "type": "arrow" + }, + { + "id": "5I4b30WuFigten-WiF8QS", + "type": "arrow" + } + ], + "updated": 1710153388343, + "link": null, + "locked": false + }, + { + "id": "tDLgiNFO90WiS9jCe7Ks5", + "type": "text", + "x": 663.0738681738103, + "y": 181.87258300203047, + "width": 11.71875, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 431929905, + "version": 73, + "versionNonce": 1479839825, + "isDeleted": false, + "boundElements": null, + "updated": 1710153388343, + "link": null, + "locked": false, + "text": "5", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "TxBVq2G_GfkJgm8jBI7sm", + "originalText": "5", + "lineHeight": 1.2 + }, + { + "type": "ellipse", + "version": 111, + "versionNonce": 262119185, + "isDeleted": false, + "id": "TmpUxOmXkPJBHUH7ixRGh", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 472.5, + "y": 299, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 61, + "height": 64, + "seed": 2073728657, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "type": "text", + "id": "hgBEfZcxjJm0souY2Qz3c" + }, + { + "id": "5I4b30WuFigten-WiF8QS", + "type": "arrow" + }, + { + "id": "HfyHktbDks70F3aR6q8_7", + "type": "arrow" + } + ], + "updated": 1710153374566, + "link": null, + "locked": false + }, + { + "id": "hgBEfZcxjJm0souY2Qz3c", + "type": "text", + "x": 497.0738681738103, + "y": 318.87258300203047, + "width": 11.71875, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1246776145, + "version": 32, + "versionNonce": 291214577, 
+ "isDeleted": false, + "boundElements": null, + "updated": 1710153374566, + "link": null, + "locked": false, + "text": "5", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "TmpUxOmXkPJBHUH7ixRGh", + "originalText": "5", + "lineHeight": 1.2 + }, + { + "type": "ellipse", + "version": 204, + "versionNonce": 14578047, + "isDeleted": false, + "id": "AJ1m4LrxsYWp3Nbzaq4zQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 310.790999465622, + "y": 170.82009220775217, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 1654595327, + "groupIds": [ + "AY0cinYhwJeho5EwyTham" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "_B02NJtgoNBJYYYLXVhP-", + "type": "arrow" + } + ], + "updated": 1710153265599, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 227, + "versionNonce": 1469753841, + "isDeleted": false, + "id": "zbwXexqCAbSOzZMYRFfXQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 320.790999465622, + "y": 191.82009220775217, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 343992095, + "groupIds": [ + "AY0cinYhwJeho5EwyTham" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 187, + "versionNonce": 70379935, + "isDeleted": false, + "id": "QbDhJFXPSHETpEc-VRuRb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 320.790999465622, + "y": 241.82009220775217, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 322218815, + "groupIds": [ + "AY0cinYhwJeho5EwyTham" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 196, + "versionNonce": 2138194897, + "isDeleted": false, + "id": "fQ1ZSjURUSBkq9OhBT8UC", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 321.790999465622, + "y": 247.82009220775217, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 1796926303, + "groupIds": [ + "AY0cinYhwJeho5EwyTham" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 195, + "versionNonce": 446782911, + "isDeleted": false, + "id": "hIscJMbAgasZ483xaVPce", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 
100, + "angle": 0, + "x": 321.790999465622, + "y": 212.82009220775217, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 1216620415, + "groupIds": [ + "AY0cinYhwJeho5EwyTham" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 193, + "versionNonce": 1627103665, + "isDeleted": false, + "id": "xayh_HsV0ea-ZNXXO6p26", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 319.790999465622, + "y": 212.82009220775217, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 2041634719, + "groupIds": [ + "AY0cinYhwJeho5EwyTham" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "id": "_B02NJtgoNBJYYYLXVhP-", + "type": "arrow", + "x": 344, + "y": 188, + "width": 116, + "height": 18, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1290861215, + "version": 31, + "versionNonce": 2001554911, + "isDeleted": false, + "boundElements": null, + "updated": 1710153265599, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 116, + -18 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "AJ1m4LrxsYWp3Nbzaq4zQ", + "focus": 0.7849771931188008, + "gap": 11.893719232587433 + }, + "endBinding": { + "elementId": "cZTSQwT7h9aEkKp0RZYUC", + "focus": 0.19427677262720397, + "gap": 10 + }, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "type": "ellipse", + "version": 217, + "versionNonce": 683348447, + "isDeleted": false, + "id": "TWyTxagvj4byeb2LOMk_a", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 801.2368813798204, + "y": 213.10194573283195, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 2120917951, + "groupIds": [ + "POSJwaQGe1EUvePNCTKgP" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "hCiqyN7QmOnn0eQKJ4K3s", + "type": "arrow" + } + ], + "updated": 1710153269818, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 240, + "versionNonce": 356923903, + "isDeleted": false, + "id": "fgPkAfF4gi4Y2L-370fbi", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 811.2368813798204, + "y": 234.10194573283195, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 1576915935, + "groupIds": [ + "POSJwaQGe1EUvePNCTKgP" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": 
null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 200, + "versionNonce": 1959897457, + "isDeleted": false, + "id": "ikqDOf96tyFHfO7pMcCtz", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 811.2368813798204, + "y": 284.10194573283195, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 536215551, + "groupIds": [ + "POSJwaQGe1EUvePNCTKgP" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 209, + "versionNonce": 965517855, + "isDeleted": false, + "id": "QDDr5wktsOOSCWLzBcqGw", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 812.2368813798204, + "y": 290.10194573283195, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 643016735, + "groupIds": [ + "POSJwaQGe1EUvePNCTKgP" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 208, + "versionNonce": 1822249809, + "isDeleted": false, + "id": "nPNIb3-v3DLTZHfgAoFbY", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 812.2368813798204, + "y": 255.10194573283195, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 1057442879, + "groupIds": [ + "POSJwaQGe1EUvePNCTKgP" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 206, + "versionNonce": 890308159, + "isDeleted": false, + "id": "u_ZFxAK5vmcAc0BkdI2uf", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 810.2368813798204, + "y": 255.10194573283195, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 636578911, + "groupIds": [ + "POSJwaQGe1EUvePNCTKgP" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710153265599, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "id": "hCiqyN7QmOnn0eQKJ4K3s", + "type": "arrow", + "x": 794.3610349594956, + "y": 220.84369204641044, + "width": 84.77075619139328, + "height": 20.7675972154112, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": 
"transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2074833681, + "version": 173, + "versionNonce": 240611345, + "isDeleted": false, + "boundElements": null, + "updated": 1710153388343, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -84.77075619139328, + -20.7675972154112 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "TWyTxagvj4byeb2LOMk_a", + "focus": -0.054852948965267594, + "gap": 7.2462859154274 + }, + "endBinding": { + "elementId": "TxBVq2G_GfkJgm8jBI7sm", + "focus": -0.11770655895007504, + "gap": 10.511343268280434 + }, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "XBVA2zJQJ21mjmvFhum6O", + "type": "arrow", + "x": 543.584779481007, + "y": 169.372306767417, + "width": 92.22979777396336, + "height": 32.60606055213128, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1051091057, + "version": 174, + "versionNonce": 630546385, + "isDeleted": false, + "boundElements": null, + "updated": 1710153388343, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 92.22979777396336, + 32.60606055213128 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "cZTSQwT7h9aEkKp0RZYUC", + "focus": -0.4669635423107049, + "gap": 12.589049176495859 + }, + "endBinding": { + "elementId": "TxBVq2G_GfkJgm8jBI7sm", + "focus": -0.5837044278883812, + "gap": 3.553910870509334 + }, + "startArrowhead": "triangle", + "endArrowhead": "triangle" + }, + { + "id": "5I4b30WuFigten-WiF8QS", + "type": "arrow", + "x": 655.915835191837, + "y": 234.59197018015885, + "width": 119.29816681808734, + "height": 77.75029897767553, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 753889649, + "version": 113, + "versionNonce": 1047158673, + "isDeleted": false, + "boundElements": null, + "updated": 1710153388343, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -119.29816681808734, + 77.75029897767553 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "TxBVq2G_GfkJgm8jBI7sm", + "focus": -0.8504524746394679, + "gap": 10.796795872030433 + }, + "endBinding": { + "elementId": "TmpUxOmXkPJBHUH7ixRGh", + "focus": 0.08632412336566028, + "gap": 7.610268376597304 + }, + "startArrowhead": "triangle", + "endArrowhead": "triangle" + }, + { + "id": "HfyHktbDks70F3aR6q8_7", + "type": "arrow", + "x": 502.70526856282265, + "y": 291.50581223504287, + "width": 4.913678825258955, + "height": 82.9950153189236, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 436366623, + "version": 87, + "versionNonce": 759501937, + "isDeleted": false, + "boundElements": null, + "updated": 1710153374566, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 4.913678825258955, + -82.9950153189236 + ] + ], + 
"lastCommittedPoint": null, + "startBinding": { + "elementId": "TmpUxOmXkPJBHUH7ixRGh", + "focus": -0.08982127757646, + "gap": 7.495376032460523 + }, + "endBinding": { + "elementId": "cZTSQwT7h9aEkKp0RZYUC", + "focus": -0.3075698292769691, + "gap": 7.2157139089914715 + }, + "startArrowhead": "triangle", + "endArrowhead": "triangle" + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/cap-theorem-introduction.svg b/assets/img/high-level-design/cap-theorem-introduction.svg new file mode 100644 index 0000000..86c4917 --- /dev/null +++ b/assets/img/high-level-design/cap-theorem-introduction.svg @@ -0,0 +1,21 @@ + + + + + + + + 655 \ No newline at end of file diff --git a/assets/img/high-level-design/choreography-pattern.png b/assets/img/high-level-design/choreography-pattern.png new file mode 100644 index 0000000..9a8531d Binary files /dev/null and b/assets/img/high-level-design/choreography-pattern.png differ diff --git a/assets/img/high-level-design/circuit-breaker-pattern.png b/assets/img/high-level-design/circuit-breaker-pattern.png new file mode 100644 index 0000000..63773b5 Binary files /dev/null and b/assets/img/high-level-design/circuit-breaker-pattern.png differ diff --git a/assets/img/high-level-design/consistent-hashing-disadvantage.png b/assets/img/high-level-design/consistent-hashing-disadvantage.png new file mode 100644 index 0000000..327adb6 Binary files /dev/null and b/assets/img/high-level-design/consistent-hashing-disadvantage.png differ diff --git a/assets/img/high-level-design/consistent-hashing-replication.png b/assets/img/high-level-design/consistent-hashing-replication.png new file mode 100644 index 0000000..d15e66d Binary files /dev/null and b/assets/img/high-level-design/consistent-hashing-replication.png differ diff --git a/assets/img/high-level-design/consistent-hashing-workaround.png b/assets/img/high-level-design/consistent-hashing-workaround.png new file mode 100644 index 0000000..4b4e0b4 Binary files /dev/null and b/assets/img/high-level-design/consistent-hashing-workaround.png differ diff --git a/assets/img/high-level-design/consistent-hashing.png b/assets/img/high-level-design/consistent-hashing.png new file mode 100644 index 0000000..e87ca94 Binary files /dev/null and b/assets/img/high-level-design/consistent-hashing.png differ diff --git a/assets/img/high-level-design/cqrs.png b/assets/img/high-level-design/cqrs.png new file mode 100644 index 0000000..eaa5e59 Binary files /dev/null and b/assets/img/high-level-design/cqrs.png differ diff --git a/assets/img/high-level-design/degradation-point.excalidraw b/assets/img/high-level-design/degradation-point.excalidraw new file mode 100644 index 0000000..92f574f --- /dev/null +++ b/assets/img/high-level-design/degradation-point.excalidraw @@ -0,0 +1,512 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "8t2bTIYw3YP7wd4y-HPpj", + "type": "arrow", + "x": 251, + "y": 448, + "width": 2.842170943040401e-14, + "height": 213.00000000000003, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 85199810, + "version": 31, + "versionNonce": 1319681986, + "isDeleted": false, + "boundElements": null, + "updated": 1709965946163, + "link": null, + 
"locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 2.842170943040401e-14, + -213.00000000000003 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "b98LjBuCbkM76BT7J3l-R", + "type": "arrow", + "x": 248, + "y": 446, + "width": 294, + "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2113670274, + "version": 79, + "versionNonce": 1561695582, + "isDeleted": false, + "boundElements": null, + "updated": 1709965946163, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 294, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "H4nzm3n1M_G5ZG6QIAmPU", + "type": "arrow", + "x": 252, + "y": 344, + "width": 165, + "height": 72, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 496634178, + "version": 287, + "versionNonce": 1488942978, + "isDeleted": false, + "boundElements": null, + "updated": 1709965946163, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 141, + -9 + ], + [ + 165, + -72 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "type": "arrow", + "version": 57, + "versionNonce": 783759774, + "isDeleted": false, + "id": "4YW3uXxy6Ysz1vd7RL3Lw", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 718.622055205726, + "y": 449.86003382032277, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 2.842170943040401e-14, + "height": 213.00000000000003, + "seed": 1405625282, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965946163, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 2.842170943040401e-14, + -213.00000000000003 + ] + ] + }, + { + "type": "arrow", + "version": 105, + "versionNonce": 648623938, + "isDeleted": false, + "id": "4AmTRAByf5gBnFQYeec2Z", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 715.622055205726, + "y": 447.86003382032277, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 294, + "height": 0, + "seed": 1280465794, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965946163, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 294, + 0 + ] + ] + }, + { + "type": "arrow", + "version": 477, + "versionNonce": 1562007006, + "isDeleted": false, + "id": "h1qGVoLkhIzWupDVLuO9f", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + 
"angle": 0, + "x": 721.622055205726, + "y": 445.86003382032277, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 163, + "height": 109, + "seed": 1849662274, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965946163, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 139, + -109 + ], + [ + 163, + -27 + ] + ] + }, + { + "id": "HNqz6AFnms_GyBoXRs2UF", + "type": "text", + "x": 122, + "y": 337, + "width": 93.75, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1556972062, + "version": 27, + "versionNonce": 1489305346, + "isDeleted": false, + "boundElements": null, + "updated": 1709965946163, + "link": null, + "locked": false, + "text": "Response\n Time", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Response\n Time", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 81, + "versionNonce": 1095756318, + "isDeleted": false, + "id": "UenHDOxn87DabKl8oyiRh", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 582.125, + "y": 336, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 117.1875, + "height": 24, + "seed": 1428543362, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709965946163, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Throughput", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Throughput", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 140, + "versionNonce": 730975938, + "isDeleted": false, + "id": "khqMMsfxhMxULJVXZ2dek", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 356.40625, + "y": 466, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 46.875, + "height": 24, + "seed": 1596700738, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709965946163, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Load", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Load", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 190, + "versionNonce": 1783166558, + "isDeleted": false, + "id": "jzJAjiUGPaaPw0RMU5CGr", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 836.5625, + "y": 466, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 46.875, + "height": 24, + "seed": 754231362, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709965946163, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Load", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Load", + "lineHeight": 1.2 + }, + { + "id": "GZAupg3dL6KpjTS_tiWgD", + "type": "line", + "x": 397, + "y": 332, + "width": 0, + "height": 
111, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 433751618, + "version": 33, + "versionNonce": 152994462, + "isDeleted": false, + "boundElements": null, + "updated": 1709965949450, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 111 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "id": "kTWbbC798BWAR78mnq1ui", + "type": "line", + "x": 862, + "y": 336, + "width": 0, + "height": 107, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1925309954, + "version": 36, + "versionNonce": 1119177630, + "isDeleted": false, + "boundElements": null, + "updated": 1709965956751, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 107 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/degradation-point.svg b/assets/img/high-level-design/degradation-point.svg new file mode 100644 index 0000000..5d0e76c --- /dev/null +++ b/assets/img/high-level-design/degradation-point.svg @@ -0,0 +1,21 @@ + + + + + + + + Response TimeThroughputLoadLoad \ No newline at end of file diff --git a/assets/img/high-level-design/event-sourcing+cqrs.png b/assets/img/high-level-design/event-sourcing+cqrs.png new file mode 100644 index 0000000..9aa46c5 Binary files /dev/null and b/assets/img/high-level-design/event-sourcing+cqrs.png differ diff --git a/assets/img/high-level-design/execution-orchestrator-pattern.png b/assets/img/high-level-design/execution-orchestrator-pattern.png new file mode 100644 index 0000000..de0eaf7 Binary files /dev/null and b/assets/img/high-level-design/execution-orchestrator-pattern.png differ diff --git a/assets/img/high-level-design/features-of-the-system.excalidraw b/assets/img/high-level-design/features-of-the-system.excalidraw new file mode 100644 index 0000000..d8443f4 --- /dev/null +++ b/assets/img/high-level-design/features-of-the-system.excalidraw @@ -0,0 +1,2684 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "type": "ellipse", + "version": 250, + "versionNonce": 372959362, + "isDeleted": false, + "id": "9npojYAx0wJ43XtyDbxJq", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 456.40261024851816, + "y": 231.65552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 1428488734, + "groupIds": [ + "yzsDnNDPPeBFCJCKKvwgs" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660308, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 270, + "versionNonce": 2118014622, + "isDeleted": false, + "id": "H9yA11TDFq4sHJPzLHwdc", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + 
"opacity": 100, + "angle": 0, + "x": 466.40261024851816, + "y": 252.65552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 1941638750, + "groupIds": [ + "yzsDnNDPPeBFCJCKKvwgs" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 230, + "versionNonce": 1282304578, + "isDeleted": false, + "id": "2Y7sy29JCcopGONxKvrNW", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 466.40261024851816, + "y": 302.65552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 608539294, + "groupIds": [ + "yzsDnNDPPeBFCJCKKvwgs" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 239, + "versionNonce": 1203098334, + "isDeleted": false, + "id": "S7tOfMgovXK_1-rOhfcRX", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 467.40261024851816, + "y": 308.65552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 561352414, + "groupIds": [ + "yzsDnNDPPeBFCJCKKvwgs" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 238, + "versionNonce": 359387650, + "isDeleted": false, + "id": "0pyC_3hjq6gYsgo5PFXbq", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 467.40261024851816, + "y": 273.65552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 668730142, + "groupIds": [ + "yzsDnNDPPeBFCJCKKvwgs" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 236, + "versionNonce": 628344606, + "isDeleted": false, + "id": "tbZUJjNhnAHhp8MbEFP_1", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 465.40261024851816, + "y": 273.65552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 971632478, + "groupIds": [ + "yzsDnNDPPeBFCJCKKvwgs" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": 
null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "type": "ellipse", + "version": 208, + "versionNonce": 1910915522, + "isDeleted": false, + "id": "2skvFOt_-CkUDvAqMlsR2", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 983.9897521165245, + "y": 234.60035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 807231170, + "groupIds": [ + "NuweDgEEev0cYpZrrRiqA" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 232, + "versionNonce": 938029918, + "isDeleted": false, + "id": "RLEHxic60tJA2HsZWTlTC", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 993.9897521165245, + "y": 255.60035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 4732546, + "groupIds": [ + "NuweDgEEev0cYpZrrRiqA" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 192, + "versionNonce": 2096937346, + "isDeleted": false, + "id": "-B7z2E04xqIKsuJmht7so", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 993.9897521165245, + "y": 305.60035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 2093094466, + "groupIds": [ + "NuweDgEEev0cYpZrrRiqA" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 201, + "versionNonce": 1768363934, + "isDeleted": false, + "id": "hrXr_22oiA10oDFgFlQix", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 994.9897521165245, + "y": 311.60035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 1449786882, + "groupIds": [ + "NuweDgEEev0cYpZrrRiqA" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 200, + "versionNonce": 367933762, + "isDeleted": false, + "id": "OMoL1v_iW-rtxNYfKw0YO", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 994.9897521165245, + "y": 276.60035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 1245638082, + 
"groupIds": [ + "NuweDgEEev0cYpZrrRiqA" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 198, + "versionNonce": 840856542, + "isDeleted": false, + "id": "Rn85q_jlgfximLNgIQQcL", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 992.9897521165245, + "y": 276.60035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 492823938, + "groupIds": [ + "NuweDgEEev0cYpZrrRiqA" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957660307, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "id": "YjRASZE3oI1VO0vRO-6Bc", + "type": "rectangle", + "x": 645, + "y": 236, + "width": 179, + "height": 67, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 481661378, + "version": 57, + "versionNonce": 1545191682, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "gXzOTe5tvHiWtwELJAkl4" + } + ], + "updated": 1709957660307, + "link": null, + "locked": false + }, + { + "id": "gXzOTe5tvHiWtwELJAkl4", + "type": "text", + "x": 675.90625, + "y": 245.5, + "width": 117.1875, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2067460802, + "version": 30, + "versionNonce": 1545619486, + "isDeleted": false, + "boundElements": null, + "updated": 1709957660308, + "link": null, + "locked": false, + "text": "Hitchikers\nService", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "YjRASZE3oI1VO0vRO-6Bc", + "originalText": "Hitchikers\nService", + "lineHeight": 1.2 + }, + { + "id": "c7cgnoRTqbFCE3N8WvJkF", + "type": "line", + "x": 465, + "y": 315, + "width": 1, + "height": 419, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1703839810, + "version": 99, + "versionNonce": 900563650, + "isDeleted": false, + "boundElements": null, + "updated": 1709957686283, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 1, + 419 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "type": "line", + "version": 164, + "versionNonce": 717005314, + "isDeleted": false, + "id": "hu_WYdmYATCr1zr8-eMB6", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 996.2432590988744, + "y": 
318.48996176640503, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 5, + "height": 411, + "seed": 1256573406, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957690757, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 5, + 411 + ] + ] + }, + { + "type": "line", + "version": 144, + "versionNonce": 578320770, + "isDeleted": false, + "id": "zUX0FdHjhwUJfKTA9JJ4O", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 736.2432590988744, + "y": 307.48996176640503, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 428, + "seed": 50284446, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957712794, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 428 + ] + ] + }, + { + "id": "Vcwii2qNDkipOWQe6US6E", + "type": "arrow", + "x": 995, + "y": 340, + "width": 254, + "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 631603458, + "version": 70, + "versionNonce": 917423774, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "FvQbzILEV6FJgX-DCiHuQ" + } + ], + "updated": 1709957766334, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -254, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "FvQbzILEV6FJgX-DCiHuQ", + "type": "text", + "x": 821.125, + "y": 316, + "width": 93.75, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1147329886, + "version": 19, + "versionNonce": 1031881410, + "isDeleted": false, + "boundElements": null, + "updated": 1709957765608, + "link": null, + "locked": false, + "text": "Ready to\nPickup", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Vcwii2qNDkipOWQe6US6E", + "originalText": "Ready to\nPickup", + "lineHeight": 1.2 + }, + { + "type": "arrow", + "version": 161, + "versionNonce": 2032344578, + "isDeleted": false, + "id": "9vDRefa5U4AEbvUOa74CG", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 471.1299963510269, + "y": 384.22480994316584, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 262, + "height": 1, + "seed": 482067806, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "type": "text", + "id": "-Mjz3vGu7DSl5YI7e8XLE" + } + ], + "updated": 1709957775656, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + 
"points": [ + [ + 0, + 0 + ], + [ + 262, + -1 + ] + ] + }, + { + "id": "-Mjz3vGu7DSl5YI7e8XLE", + "type": "text", + "x": 561.1143713510269, + "y": 359.72480994316584, + "width": 82.03125, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1887975198, + "version": 14, + "versionNonce": 1210005058, + "isDeleted": false, + "boundElements": null, + "updated": 1709957774706, + "link": null, + "locked": false, + "text": "Request\nRide", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "9vDRefa5U4AEbvUOa74CG", + "originalText": "Request\nRide", + "lineHeight": 1.2 + }, + { + "id": "7CJ7ilsEAyD9scEwqssge", + "type": "arrow", + "x": 727, + "y": 434, + "width": 73, + "height": 115, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2027163074, + "version": 250, + "versionNonce": 1405554398, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "pTyoM30MvMNNQQHpKkMQm" + } + ], + "updated": 1709957819426, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -63, + 63 + ], + [ + 10, + 115 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "pTyoM30MvMNNQQHpKkMQm", + "type": "text", + "x": 634.703125, + "y": 485, + "width": 58.59375, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1851226910, + "version": 29, + "versionNonce": 825265794, + "isDeleted": false, + "boundElements": null, + "updated": 1709957818506, + "link": null, + "locked": false, + "text": "Match", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "7CJ7ilsEAyD9scEwqssge", + "originalText": "Match", + "lineHeight": 1.2 + }, + { + "id": "IuRfraODVptK8RaA0gCS2", + "type": "arrow", + "x": 736, + "y": 575, + "width": 261, + "height": 1, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 335381278, + "version": 76, + "versionNonce": 1182385282, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "mO6w3l1ixI_yPQ3YNbskJ" + } + ], + "updated": 1709957850008, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 261, + 1 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "mO6w3l1ixI_yPQ3YNbskJ", + "type": "text", + "x": 831.34375, + "y": 563.5, + "width": 70.3125, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + 
"roundness": null, + "seed": 202706626, + "version": 7, + "versionNonce": 301345822, + "isDeleted": false, + "boundElements": null, + "updated": 1709957849070, + "link": null, + "locked": false, + "text": "Notify", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "IuRfraODVptK8RaA0gCS2", + "originalText": "Notify", + "lineHeight": 1.2 + }, + { + "type": "arrow", + "version": 142, + "versionNonce": 318535490, + "isDeleted": false, + "id": "bc8m9-a9EWJXhP9CZmwkn", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 732.5633472323836, + "y": 574.9768911687191, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 262, + "height": 2, + "seed": 231426882, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "type": "text", + "id": "aAhVsjYtdaqkFn_d3L3bj" + } + ], + "updated": 1709957846112, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -262, + 2 + ] + ] + }, + { + "id": "aAhVsjYtdaqkFn_d3L3bj", + "type": "text", + "x": 566.4070972323836, + "y": 563.9768911687191, + "width": 70.3125, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 795144706, + "version": 9, + "versionNonce": 1741051230, + "isDeleted": false, + "boundElements": null, + "updated": 1709957845515, + "link": null, + "locked": false, + "text": "Notify", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "bc8m9-a9EWJXhP9CZmwkn", + "originalText": "Notify", + "lineHeight": 1.2 + }, + { + "id": "sIWYQg5q60-uTt2ho2SyA", + "type": "arrow", + "x": 999, + "y": 672, + "width": 264, + "height": 1, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1636951106, + "version": 56, + "versionNonce": 13386114, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "HTaSAWo0f2xlsF6Kldz_2" + } + ], + "updated": 1709957875337, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -264, + -1 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "HTaSAWo0f2xlsF6Kldz_2", + "type": "text", + "x": 837.703125, + "y": 647.5, + "width": 58.59375, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1619056258, + "version": 18, + "versionNonce": 849264414, + "isDeleted": false, + "boundElements": null, + "updated": 1709957874165, + "link": null, + "locked": false, + "text": "Start\nRide", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "sIWYQg5q60-uTt2ho2SyA", + "originalText": "Start\nRide", + "lineHeight": 1.2 + }, + { + 
"id": "YgFiuY3t1tM920rF3PQKD", + "type": "arrow", + "x": 733, + "y": 721, + "width": 264, + "height": 1, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 109887810, + "version": 76, + "versionNonce": 485816962, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "sDrVl3PHTDXzZWg9eCauH" + } + ], + "updated": 1709957886376, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -264, + 1 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "sDrVl3PHTDXzZWg9eCauH", + "type": "text", + "x": 559.984375, + "y": 697.5, + "width": 82.03125, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 293813890, + "version": 14, + "versionNonce": 206820894, + "isDeleted": false, + "boundElements": null, + "updated": 1709957885517, + "link": null, + "locked": false, + "text": "Ride\nStarted", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "YgFiuY3t1tM920rF3PQKD", + "originalText": "Ride\nStarted", + "lineHeight": 1.2 + }, + { + "id": "9kkMZGdCCdeiqYQrkwCUY", + "type": "text", + "x": 645, + "y": 168, + "width": 175.78125, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 916805662, + "version": 20, + "versionNonce": 955309058, + "isDeleted": false, + "boundElements": null, + "updated": 1709957971242, + "link": null, + "locked": false, + "text": "Ride Initiation", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Ride Initiation", + "lineHeight": 1.2 + }, + { + "type": "ellipse", + "version": 366, + "versionNonce": 312490690, + "isDeleted": false, + "id": "UnwcpXgA4ENFYugusa29-", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1173.6379065477618, + "y": 228.90552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 494097886, + "groupIds": [ + "Hu-9uOIkU2s04N9xqXCyY" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 386, + "versionNonce": 1175425666, + "isDeleted": false, + "id": "IZK4ghzy6V37VhH74RWyW", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1183.6379065477618, + "y": 249.90552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 243853854, + "groupIds": [ + "Hu-9uOIkU2s04N9xqXCyY" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + 
"lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 346, + "versionNonce": 1228796482, + "isDeleted": false, + "id": "PVHSKPy2RRBF0IawFefD2", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1183.6379065477618, + "y": 299.90552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 677508702, + "groupIds": [ + "Hu-9uOIkU2s04N9xqXCyY" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 355, + "versionNonce": 1594513922, + "isDeleted": false, + "id": "9-dfAxvWrbeBrfhU3yKvt", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1184.6379065477618, + "y": 305.90552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 1942977182, + "groupIds": [ + "Hu-9uOIkU2s04N9xqXCyY" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 354, + "versionNonce": 376687042, + "isDeleted": false, + "id": "4u_Y-sH_xZnmnqZ6ldshg", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1184.6379065477618, + "y": 270.90552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 931296990, + "groupIds": [ + "Hu-9uOIkU2s04N9xqXCyY" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 352, + "versionNonce": 1184448898, + "isDeleted": false, + "id": "nlk5aTkI7-JsJmWwbuUII", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1182.6379065477618, + "y": 270.90552048887594, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 1007414046, + "groupIds": [ + "Hu-9uOIkU2s04N9xqXCyY" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "type": "ellipse", + "version": 324, + "versionNonce": 179945794, + "isDeleted": false, + "id": "Gkson0JfePMCO97FcpWrr", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 
1701.225048415768, + "y": 231.85035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 266131294, + "groupIds": [ + "xoQypCq5g1drMxI4tDsHd" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 348, + "versionNonce": 1043613954, + "isDeleted": false, + "id": "NEnVlNu8U-CKtNk1t3qLs", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1711.225048415768, + "y": 252.85035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 1262893982, + "groupIds": [ + "xoQypCq5g1drMxI4tDsHd" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 308, + "versionNonce": 2022168770, + "isDeleted": false, + "id": "BfAJvotEbAyZ5MuNxiy54", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1711.225048415768, + "y": 302.85035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 1417222110, + "groupIds": [ + "xoQypCq5g1drMxI4tDsHd" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 317, + "versionNonce": 156463234, + "isDeleted": false, + "id": "f4cceANDJSTby9EB-LaQA", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1712.225048415768, + "y": 308.85035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 596012062, + "groupIds": [ + "xoQypCq5g1drMxI4tDsHd" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 316, + "versionNonce": 1951797314, + "isDeleted": false, + "id": "ZtGXReFMyvBRZ8Fogwx8G", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1712.225048415768, + "y": 273.85035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 106903646, + "groupIds": [ + "xoQypCq5g1drMxI4tDsHd" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 314, + "versionNonce": 
1835972610, + "isDeleted": false, + "id": "M7kif7QDFOUpVvzjynxnT", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1710.225048415768, + "y": 273.85035292990506, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 1691647134, + "groupIds": [ + "xoQypCq5g1drMxI4tDsHd" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "type": "rectangle", + "version": 173, + "versionNonce": 1419536322, + "isDeleted": false, + "id": "3rQUCF_cfcfkQxiNThC-5", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1362.2352962992436, + "y": 233.25, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 179, + "height": 67, + "seed": 77019358, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "uy1gH9F-x9RKgjDft7m-p" + } + ], + "updated": 1709957997854, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 146, + "versionNonce": 838737794, + "isDeleted": false, + "id": "uy1gH9F-x9RKgjDft7m-p", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1393.1415462992436, + "y": 242.75, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 117.1875, + "height": 48, + "seed": 1295543582, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Hitchikers\nService", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3rQUCF_cfcfkQxiNThC-5", + "originalText": "Hitchikers\nService", + "lineHeight": 1.2 + }, + { + "type": "line", + "version": 215, + "versionNonce": 835903298, + "isDeleted": false, + "id": "P5Z-Tn02VXsJnNYm0EaZd", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1182.2352962992436, + "y": 312.25, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 1, + "height": 419, + "seed": 204020062, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 1, + 419 + ] + ] + }, + { + "type": "line", + "version": 280, + "versionNonce": 544843522, + "isDeleted": false, + "id": "m8vw9oSY5tqObC4mxRfuW", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1713.4785553981178, + "y": 315.73996176640503, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 5, + "height": 411, + "seed": 951729566, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + 
"startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 5, + 411 + ] + ] + }, + { + "type": "line", + "version": 260, + "versionNonce": 1718712002, + "isDeleted": false, + "id": "aQNhr1D9KLt3VTS4ggQK1", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1453.4785553981178, + "y": 304.73996176640503, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 428, + "seed": 941414878, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 428 + ] + ] + }, + { + "type": "arrow", + "version": 187, + "versionNonce": 2046981598, + "isDeleted": false, + "id": "LLFDeW5JLaIjVX8y8-XD_", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1712.2352962992436, + "y": 337.25, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 254, + "height": 0, + "seed": 560066078, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "type": "text", + "id": "XK_DerYbhKdCZd5uZKCz7" + } + ], + "updated": 1709958057434, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -254, + 0 + ] + ] + }, + { + "type": "text", + "version": 36, + "versionNonce": 1826707330, + "isDeleted": false, + "id": "XK_DerYbhKdCZd5uZKCz7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1561.7977962992436, + "y": 313.25, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 46.875, + "height": 48, + "seed": 24803934, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709958056386, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "End\nRide", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "LLFDeW5JLaIjVX8y8-XD_", + "originalText": "End\nRide", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 146, + "versionNonce": 979178690, + "isDeleted": false, + "id": "3sQKk3tER7iDp8a3ermkO", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1362.2352962992436, + "y": 165.25, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 175.78125, + "height": 24, + "seed": 1421623710, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709957997854, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Ride Completion", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Ride Completion", + "lineHeight": 1.2 + }, + { + "id": "2lxHylOaYf2H38G91enbB", + "type": "text", + "x": 958, + "y": 205, + "width": 70.3125, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + 
"seed": 132059358, + "version": 7, + "versionNonce": 430639746, + "isDeleted": false, + "boundElements": null, + "updated": 1709958011120, + "link": null, + "locked": false, + "text": "Driver", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Driver", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 133, + "versionNonce": 339014430, + "isDeleted": false, + "id": "p85hjD4e_NI53dKoH67VF", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 430.84375, + "y": 194, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 58.59375, + "height": 24, + "seed": 1361199554, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709958041712, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Rider", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Rider", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 44, + "versionNonce": 1547972802, + "isDeleted": false, + "id": "iz8mt2OjXTPvILaTiZEXN", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1669.84375, + "y": 203, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 70.3125, + "height": 24, + "seed": 1642965442, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709958027685, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Driver", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Driver", + "lineHeight": 1.2 + }, + { + "type": "text", + "version": 204, + "versionNonce": 1580565790, + "isDeleted": false, + "id": "yZKpjZqS2vkFtmR5yCUUJ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1153.703125, + "y": 193, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 58.59375, + "height": 24, + "seed": 1657619358, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709958046109, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Rider", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Rider", + "lineHeight": 1.2 + }, + { + "id": "GrJJbNJO8rwxgcM4Zx_cx", + "type": "arrow", + "x": 1443, + "y": 361, + "width": 63, + "height": 109, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1820398466, + "version": 174, + "versionNonce": 69648578, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "l4EUZGfTjiypCk3rWnRJ3" + } + ], + "updated": 1709958130207, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -56, + 55 + ], + [ + 7, + 109 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "l4EUZGfTjiypCk3rWnRJ3", + "type": "text", + "x": 1339.265625, + "y": 355, + "width": 105.46875, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + 
"fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 335430786, + "version": 21, + "versionNonce": 356097502, + "isDeleted": false, + "boundElements": null, + "updated": 1709958093637, + "link": null, + "locked": false, + "text": "Calculate\nFee", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "GrJJbNJO8rwxgcM4Zx_cx", + "originalText": "Calculate\nFee", + "lineHeight": 1.2 + }, + { + "id": "u1uQn7YP7caRnuJZ1VuWy", + "type": "arrow", + "x": 1455, + "y": 486, + "width": 272, + "height": 2, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 534912194, + "version": 77, + "versionNonce": 2027920514, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "0XyfKyvVvxFo0OAe9S0KY" + } + ], + "updated": 1709958127002, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -272, + 2 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "0XyfKyvVvxFo0OAe9S0KY", + "type": "text", + "x": 1275.984375, + "y": 442, + "width": 82.03125, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 918133662, + "version": 18, + "versionNonce": 1848920258, + "isDeleted": false, + "boundElements": null, + "updated": 1709958124187, + "link": null, + "locked": false, + "text": "Show\nReceipt", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "u1uQn7YP7caRnuJZ1VuWy", + "originalText": "Show\nReceipt", + "lineHeight": 1.2 + }, + { + "id": "ZbBevtVtjX60KqM5WGZpy", + "type": "arrow", + "x": 1184, + "y": 557, + "width": 274, + "height": 3, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 56293442, + "version": 71, + "versionNonce": 1848480130, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "rSbhg6j8A1i5PBinMOU49" + } + ], + "updated": 1709958158478, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 274, + 3 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "rSbhg6j8A1i5PBinMOU49", + "type": "text", + "x": 1279.984375, + "y": 534.5, + "width": 82.03125, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1772758658, + "version": 18, + "versionNonce": 1290170142, + "isDeleted": false, + "boundElements": null, + "updated": 1709958157557, + "link": null, + "locked": false, + "text": "Make\nPayment", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + 
"verticalAlign": "middle", + "containerId": "ZbBevtVtjX60KqM5WGZpy", + "originalText": "Make\nPayment", + "lineHeight": 1.2 + }, + { + "type": "arrow", + "version": 113, + "versionNonce": 461637342, + "isDeleted": false, + "id": "aKfYOuNdpo9bEsQ1-lmks", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1455.6393724702298, + "y": 615.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 261, + "height": 3, + "seed": 16707614, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "type": "text", + "id": "BrCTrmppdCGZ283OPrswD" + } + ], + "updated": 1709958166944, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 261, + 3 + ] + ] + }, + { + "type": "text", + "version": 20, + "versionNonce": 1547681922, + "isDeleted": false, + "id": "BrCTrmppdCGZ283OPrswD", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1456.6237474702298, + "y": 609, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 82.03125, + "height": 48, + "seed": 707214430, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1709958162221, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Make\nPayment", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "aKfYOuNdpo9bEsQ1-lmks", + "originalText": "Make\nPayment", + "lineHeight": 1.2 + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/features-of-the-system.svg b/assets/img/high-level-design/features-of-the-system.svg new file mode 100644 index 0000000..15f1cdf --- /dev/null +++ b/assets/img/high-level-design/features-of-the-system.svg @@ -0,0 +1,21 @@ + + + + + + + + HitchikersServiceReady toPickupRequestRideMatchNotifyNotifyStartRideRideStartedRide InitiationHitchikersServiceEndRideRide CompletionDriverRiderDriverRiderCalculateFeeShowReceiptMakePaymentMakePayment \ No newline at end of file diff --git a/assets/img/high-level-design/gslb.excalidraw b/assets/img/high-level-design/gslb.excalidraw new file mode 100644 index 0000000..7d6eac6 --- /dev/null +++ b/assets/img/high-level-design/gslb.excalidraw @@ -0,0 +1,1115 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "UXXIjXc_n_COIAEuc1jYN", + "type": "rectangle", + "x": 684, + "y": 190, + "width": 223, + "height": 112, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 2014098050, + "version": 74, + "versionNonce": 787011166, + "isDeleted": false, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "id": "b1zMfjz4PQHrPV-s4p0-7", + "type": "text", + "x": 718, + "y": 157, + "width": 152.34375, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": 
null, + "roundness": null, + "seed": 14942622, + "version": 66, + "versionNonce": 954508866, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521299, + "link": null, + "locked": false, + "text": "Data Center 1", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Data Center 1", + "lineHeight": 1.2 + }, + { + "id": "u_fpMgzqjvecWHlYWKPrC", + "type": "rectangle", + "x": 719, + "y": 215, + "width": 25, + "height": 51, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 987346946, + "version": 51, + "versionNonce": 835802846, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 72, + "versionNonce": 263884290, + "isDeleted": false, + "id": "tERS0--bJs9ZnKBHETKHg", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 767.5, + "y": 214.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1611708254, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 81, + "versionNonce": 1233046302, + "isDeleted": false, + "id": "nW3fgXHv1VWxjcdrHw2sX", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 811.5, + "y": 215.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1362425822, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 130, + "versionNonce": 721255874, + "isDeleted": false, + "id": "6CaGLJT53ySRB_5xhbJOP", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 682.5, + "y": 367.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 223, + "height": 112, + "seed": 1120187458, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "cqyUwqpFGFzJ8JEENsYF9", + "type": "arrow" + } + ], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 125, + "versionNonce": 737392478, + "isDeleted": false, + "id": "whwYbbjqS_cr2QyRaog6H", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 716.5, + "y": 334.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 152.34375, + "height": 24, + "seed": 1047059458, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Data Center 2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Data Center 2", + "lineHeight": 1.2 + }, + { + "type": "rectangle", + "version": 111, + "versionNonce": 130839938, + "isDeleted": false, + "id": "b-SahyMhODF-7BeWDnxpP", + 
"fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 717.5, + "y": 392.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1253182402, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 129, + "versionNonce": 25238430, + "isDeleted": false, + "id": "WBYNT8N_6EzaPJHzpVdjR", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 766, + "y": 392, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 499073922, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 138, + "versionNonce": 661026114, + "isDeleted": false, + "id": "JVipqtVoq5gkVj1pei3rw", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 810, + "y": 393, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1372554050, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "id": "bDS5FXYbGUCrp31TkxIrV", + "type": "rectangle", + "x": 657, + "y": 220, + "width": 46, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 1866839234, + "version": 56, + "versionNonce": 1079506910, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "AtdonHPaYF-8vBMucveaY" + }, + { + "id": "6zpHVOOkHpHxV__KWtg51", + "type": "arrow" + } + ], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "id": "AtdonHPaYF-8vBMucveaY", + "type": "text", + "x": 668.28125, + "y": 232, + "width": 23.4375, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1521528094, + "version": 27, + "versionNonce": 187333890, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521299, + "link": null, + "locked": false, + "text": "LB", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "bDS5FXYbGUCrp31TkxIrV", + "originalText": "LB", + "lineHeight": 1.2 + }, + { + "type": "rectangle", + "version": 88, + "versionNonce": 223126658, + "isDeleted": false, + "id": "N4uuFo8_7y4r0BI1F8SA6", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 653, + "y": 398, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "width": 46, + "height": 48, + "seed": 1790196866, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "ZKMCCztN3sDELsKEDts63" + } + ], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": 
"text", + "version": 60, + "versionNonce": 916413598, + "isDeleted": false, + "id": "ZKMCCztN3sDELsKEDts63", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 664.28125, + "y": 410, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "width": 23.4375, + "height": 24, + "seed": 129850434, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "LB", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "N4uuFo8_7y4r0BI1F8SA6", + "originalText": "LB", + "lineHeight": 1.2 + }, + { + "type": "ellipse", + "version": 243, + "versionNonce": 185123906, + "isDeleted": false, + "id": "Mgfgq2GStWm5K9_Hu7hQ1", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 249.47876139779578, + "y": 195.2757201212412, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 671377986, + "groupIds": [ + "mVaVlfYSHkZ0nX0i84oWy" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "qYL2-591_spAbU_-_Mwto", + "type": "arrow" + }, + { + "id": "6zpHVOOkHpHxV__KWtg51", + "type": "arrow" + } + ], + "updated": 1710028521299, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 265, + "versionNonce": 436607198, + "isDeleted": false, + "id": "O2HnYrR1TRGR5cUOzU8PK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 259.4787613977958, + "y": 216.2757201212412, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 239194626, + "groupIds": [ + "mVaVlfYSHkZ0nX0i84oWy" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 225, + "versionNonce": 61533186, + "isDeleted": false, + "id": "J5PAOCdg1ITi-HxTnOADd", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 259.4787613977958, + "y": 266.2757201212412, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 1610367426, + "groupIds": [ + "mVaVlfYSHkZ0nX0i84oWy" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 234, + "versionNonce": 298536222, + "isDeleted": false, + "id": "RE1GcFjSeLbuyyqmnHBGB", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 260.4787613977958, + "y": 272.2757201212412, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 944945538, + "groupIds": [ + "mVaVlfYSHkZ0nX0i84oWy" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + 
"updated": 1710028521299, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 233, + "versionNonce": 1026694082, + "isDeleted": false, + "id": "088NiyMSOQp1djmt55e4u", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 260.4787613977958, + "y": 237.2757201212412, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 1988376898, + "groupIds": [ + "mVaVlfYSHkZ0nX0i84oWy" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 231, + "versionNonce": 2069887326, + "isDeleted": false, + "id": "TC1kYBfIdzvqJ6mynJdAx", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 258.4787613977958, + "y": 237.2757201212412, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 972013826, + "groupIds": [ + "mVaVlfYSHkZ0nX0i84oWy" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710028521299, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "id": "S4wfKl2a6-g1nznNTOju-", + "type": "rectangle", + "x": 497, + "y": 285, + "width": 86, + "height": 91, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 409129758, + "version": 78, + "versionNonce": 1706906498, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "z99k6w_yhbpbqmhxbEZsj" + }, + { + "id": "qYL2-591_spAbU_-_Mwto", + "type": "arrow" + }, + { + "id": "4qU48_a-GfM2OSLcfJ-PG", + "type": "arrow" + } + ], + "updated": 1710028521300, + "link": null, + "locked": false + }, + { + "id": "z99k6w_yhbpbqmhxbEZsj", + "type": "text", + "x": 516.5625, + "y": 318.5, + "width": 46.875, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 593963934, + "version": 47, + "versionNonce": 1578442142, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521300, + "link": null, + "locked": false, + "text": "GSLB", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "S4wfKl2a6-g1nznNTOju-", + "originalText": "GSLB", + "lineHeight": 1.2 + }, + { + "id": "qYL2-591_spAbU_-_Mwto", + "type": "arrow", + "x": 284.304796555187, + "y": 209.46276308505884, + "width": 196.695203444813, + "height": 75.21946527657371, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + 
"strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1359819166, + "version": 265, + "versionNonce": 260805442, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "irUzr7fgybSFGb6e4hduT" + } + ], + "updated": 1710028521300, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 196.695203444813, + 75.21946527657371 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "Mgfgq2GStWm5K9_Hu7hQ1", + "focus": -0.5246475721126254, + "gap": 12.970914572638272 + }, + "endBinding": { + "elementId": "S4wfKl2a6-g1nznNTOju-", + "focus": 0.37542364141380685, + "gap": 16 + }, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "irUzr7fgybSFGb6e4hduT", + "type": "text", + "x": 309.65239827759353, + "y": 255.4724957233457, + "width": 150, + "height": 19.2, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 353714078, + "version": 35, + "versionNonce": 790346206, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521300, + "link": null, + "locked": false, + "text": "Step 1 - xyz.com", + "fontSize": 16, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "qYL2-591_spAbU_-_Mwto", + "originalText": "Step 1 - xyz.com", + "lineHeight": 1.2 + }, + { + "id": "4qU48_a-GfM2OSLcfJ-PG", + "type": "arrow", + "x": 484, + "y": 366, + "width": 204, + "height": 102, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 265266590, + "version": 97, + "versionNonce": 754703106, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "JwWyS8OxK-jcZ1RlY9U2l" + } + ], + "updated": 1710028521300, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -204, + -102 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "S4wfKl2a6-g1nznNTOju-", + "focus": -0.9477611940298507, + "gap": 13 + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "JwWyS8OxK-jcZ1RlY9U2l", + "type": "text", + "x": 307, + "y": 305.4, + "width": 150, + "height": 19.2, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 501470594, + "version": 30, + "versionNonce": 361025054, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521300, + "link": null, + "locked": false, + "text": "Step 2 - 1.2.3.4", + "fontSize": 16, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "4qU48_a-GfM2OSLcfJ-PG", + "originalText": "Step 2 - 1.2.3.4", + "lineHeight": 1.2 + }, + { + "id": "6zpHVOOkHpHxV__KWtg51", + "type": "arrow", + "x": 271, + "y": 192, + "width": 384, + "height": 36, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": 
{ + "type": 2 + }, + "seed": 1026633090, + "version": 67, + "versionNonce": 210578114, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "hCcZGxNhHAvtx89bfzUB-" + } + ], + "updated": 1710028521300, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 384, + 36 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "Mgfgq2GStWm5K9_Hu7hQ1", + "focus": -1.3651383659990217, + "gap": 6.810798118321909 + }, + "endBinding": { + "elementId": "bDS5FXYbGUCrp31TkxIrV", + "focus": 0.5221027479091996, + "gap": 2 + }, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "hCcZGxNhHAvtx89bfzUB-", + "type": "text", + "x": 388, + "y": 200.4, + "width": 150, + "height": 19.2, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 464597662, + "version": 26, + "versionNonce": 776034910, + "isDeleted": false, + "boundElements": null, + "updated": 1710028521300, + "link": null, + "locked": false, + "text": "Step 3 - 1.2.3.4", + "fontSize": 16, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "6zpHVOOkHpHxV__KWtg51", + "originalText": "Step 3 - 1.2.3.4", + "lineHeight": 1.2 + }, + { + "id": "ycbPVXn1yfZCwutWotv7F", + "type": "text", + "x": 647, + "y": 269, + "width": 65.625, + "height": 19.2, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1415910878, + "version": 211, + "versionNonce": 1464588062, + "isDeleted": false, + "boundElements": null, + "updated": 1710028527203, + "link": null, + "locked": false, + "text": "1.2.3.4", + "fontSize": 16, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1.2.3.4", + "lineHeight": 1.2 + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/gslb.svg b/assets/img/high-level-design/gslb.svg new file mode 100644 index 0000000..dbbf989 --- /dev/null +++ b/assets/img/high-level-design/gslb.svg @@ -0,0 +1,21 @@ + + + + + + + + Data Center 1Data Center 2LBLBGSLBStep 1 - xyz.comStep 2 - 1.2.3.4Step 3 - 1.2.3.41.2.3.4 \ No newline at end of file diff --git a/assets/img/high-level-design/hw-and-sw-lb-disadvantage.excalidraw b/assets/img/high-level-design/hw-and-sw-lb-disadvantage.excalidraw new file mode 100644 index 0000000..0559ea4 --- /dev/null +++ b/assets/img/high-level-design/hw-and-sw-lb-disadvantage.excalidraw @@ -0,0 +1,449 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "type": "rectangle", + "version": 75, + "versionNonce": 1172515138, + "isDeleted": false, + "id": "UXXIjXc_n_COIAEuc1jYN", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 684, + "y": 190, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 223, + "height": 112, + "seed": 2014098050, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 
67, + "versionNonce": 1546683358, + "isDeleted": false, + "id": "b1zMfjz4PQHrPV-s4p0-7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 718, + "y": 157, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 152.34375, + "height": 24, + "seed": 14942622, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "Data Center 1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Data Center 1", + "lineHeight": 1.2 + }, + { + "type": "rectangle", + "version": 52, + "versionNonce": 535867650, + "isDeleted": false, + "id": "u_fpMgzqjvecWHlYWKPrC", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 719, + "y": 215, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 987346946, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 73, + "versionNonce": 426792990, + "isDeleted": false, + "id": "tERS0--bJs9ZnKBHETKHg", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 767.5, + "y": 214.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1611708254, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 82, + "versionNonce": 913785026, + "isDeleted": false, + "id": "nW3fgXHv1VWxjcdrHw2sX", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 811.5, + "y": 215.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1362425822, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 131, + "versionNonce": 2860126, + "isDeleted": false, + "id": "6CaGLJT53ySRB_5xhbJOP", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 682.5, + "y": 367.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 223, + "height": 112, + "seed": 1120187458, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "cqyUwqpFGFzJ8JEENsYF9", + "type": "arrow" + } + ], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 126, + "versionNonce": 622188674, + "isDeleted": false, + "id": "whwYbbjqS_cr2QyRaog6H", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 716.5, + "y": 334.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 152.34375, + "height": 24, + "seed": 1047059458, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + 
"text": "Data Center 2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Data Center 2", + "lineHeight": 1.2 + }, + { + "type": "rectangle", + "version": 112, + "versionNonce": 1110665374, + "isDeleted": false, + "id": "b-SahyMhODF-7BeWDnxpP", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 717.5, + "y": 392.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1253182402, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "cqyUwqpFGFzJ8JEENsYF9", + "type": "arrow" + } + ], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 130, + "versionNonce": 998996034, + "isDeleted": false, + "id": "WBYNT8N_6EzaPJHzpVdjR", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 766, + "y": 392, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 499073922, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 139, + "versionNonce": 792241374, + "isDeleted": false, + "id": "JVipqtVoq5gkVj1pei3rw", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 810, + "y": 393, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 25, + "height": 51, + "seed": 1372554050, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 57, + "versionNonce": 2042044418, + "isDeleted": false, + "id": "bDS5FXYbGUCrp31TkxIrV", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 657, + "y": 220, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "width": 46, + "height": 48, + "seed": 1866839234, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "AtdonHPaYF-8vBMucveaY" + }, + { + "id": "cqyUwqpFGFzJ8JEENsYF9", + "type": "arrow" + } + ], + "updated": 1710028779368, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 28, + "versionNonce": 2026658078, + "isDeleted": false, + "id": "AtdonHPaYF-8vBMucveaY", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 668.28125, + "y": 232, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffffff", + "width": 23.4375, + "height": 24, + "seed": 1521528094, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "LB", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "bDS5FXYbGUCrp31TkxIrV", + "originalText": "LB", + "lineHeight": 1.2 + }, + { + "type": "arrow", + "version": 109, + "versionNonce": 194498498, + "isDeleted": false, + "id": "cqyUwqpFGFzJ8JEENsYF9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 671, + "y": 277, + 
"strokeColor": "#e03131", + "backgroundColor": "#ffffff", + "width": 35, + "height": 128, + "seed": 1136117406, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710028779368, + "link": null, + "locked": false, + "startBinding": { + "elementId": "bDS5FXYbGUCrp31TkxIrV", + "focus": 0.6096723044397463, + "gap": 9 + }, + "endBinding": { + "elementId": "b-SahyMhODF-7BeWDnxpP", + "focus": -1.049949849548646, + "gap": 11.5 + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 35, + 128 + ] + ] + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/hw-and-sw-lb-disadvantage.svg b/assets/img/high-level-design/hw-and-sw-lb-disadvantage.svg new file mode 100644 index 0000000..61f2659 --- /dev/null +++ b/assets/img/high-level-design/hw-and-sw-lb-disadvantage.svg @@ -0,0 +1,21 @@ + + + + + + + + Data Center 1Data Center 2LB \ No newline at end of file diff --git a/assets/img/high-level-design/lambda-architecture.excalidraw b/assets/img/high-level-design/lambda-architecture.excalidraw new file mode 100644 index 0000000..097e56f --- /dev/null +++ b/assets/img/high-level-design/lambda-architecture.excalidraw @@ -0,0 +1,965 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "1OCcmpRxloJ6DgPF0vyNU", + "type": "rectangle", + "x": 384, + "y": 147, + "width": 350, + "height": 106, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 864043377, + "version": 30, + "versionNonce": 330994769, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612937, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 55, + "versionNonce": 170153279, + "isDeleted": false, + "id": "8FhIsSxIJlimcgB4RlKYq", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 387, + "y": 333, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 350, + "height": 106, + "seed": 1966809439, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710176612937, + "link": null, + "locked": false + }, + { + "id": "oHO2eTWTvJ-tVFJnoN2bJ", + "type": "rectangle", + "x": 405, + "y": 170, + "width": 55, + "height": 61, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 861559199, + "version": 33, + "versionNonce": 2055558495, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "7dZKI556nnlJfsD4MMMkT" + }, + { + "id": "YbCMBzhJHdU51pSwhoRdF", + "type": "arrow" + } + ], + "updated": 1710176612937, + "link": null, + "locked": false + }, + { + "id": "7dZKI556nnlJfsD4MMMkT", + "type": "text", + "x": 412.88001251220703, + "y": 188, + "width": 39.23997497558594, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, 
+ "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 810034463, + "version": 8, + "versionNonce": 1529639953, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612937, + "link": null, + "locked": false, + "text": "DFS", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "oHO2eTWTvJ-tVFJnoN2bJ", + "originalText": "DFS", + "lineHeight": 1.25 + }, + { + "id": "rNWzJ1bQ9jxPVRC1qcs1w", + "type": "rectangle", + "x": 408, + "y": 356, + "width": 137, + "height": 60, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 928640511, + "version": 58, + "versionNonce": 1141187167, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "s9xjJF4xIe1dfzc96sCS5" + }, + { + "id": "6VAkpFxlGmvrRIaHgXfbV", + "type": "arrow" + } + ], + "updated": 1710176636972, + "link": null, + "locked": false + }, + { + "id": "s9xjJF4xIe1dfzc96sCS5", + "type": "text", + "x": 435.36004638671875, + "y": 361, + "width": 82.2799072265625, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 880530431, + "version": 20, + "versionNonce": 1585826289, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612937, + "link": null, + "locked": false, + "text": "Message\nBroker", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "rNWzJ1bQ9jxPVRC1qcs1w", + "originalText": "Message\nBroker", + "lineHeight": 1.25 + }, + { + "id": "Qe-iZkdxkm5IFmst6BQQY", + "type": "ellipse", + "x": 631, + "y": 164, + "width": 81.99999999999997, + "height": 69, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2049677503, + "version": 105, + "versionNonce": 1006199761, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "zhgtKfozjEB4rIo1zvS0q" + }, + { + "id": "sudHYj04fhVRVygtlmUZO", + "type": "arrow" + } + ], + "updated": 1710176612937, + "link": null, + "locked": false + }, + { + "id": "zhgtKfozjEB4rIo1zvS0q", + "type": "text", + "x": 653.0886390611953, + "y": 186.1048160490641, + "width": 37.8399658203125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1811765489, + "version": 66, + "versionNonce": 630996415, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612937, + "link": null, + "locked": false, + "text": "View", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Qe-iZkdxkm5IFmst6BQQY", + "originalText": "View", + "lineHeight": 1.25 + }, + { + "type": "ellipse", + "version": 142, + "versionNonce": 1067368881, + "isDeleted": false, + "id": "vjKvs7iTUn04a1o6fvV5E", + "fillStyle": "solid", + "strokeWidth": 2, + 
"strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 635, + "y": 352.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 81.99999999999997, + "height": 69, + "seed": 449226303, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "type": "text", + "id": "daxPz2pLLGqFsE07VZ4k-" + }, + { + "id": "6VAkpFxlGmvrRIaHgXfbV", + "type": "arrow" + }, + { + "id": "SL6m5RK9xRUWfj4_UDFBm", + "type": "arrow" + } + ], + "updated": 1710176612937, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 103, + "versionNonce": 535571935, + "isDeleted": false, + "id": "daxPz2pLLGqFsE07VZ4k-", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 657.0886390611953, + "y": 374.6048160490641, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 37.8399658203125, + "height": 25, + "seed": 1152747103, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710176612938, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "View", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "vjKvs7iTUn04a1o6fvV5E", + "originalText": "View", + "lineHeight": 1.25 + }, + { + "id": "YbCMBzhJHdU51pSwhoRdF", + "type": "arrow", + "x": 461, + "y": 200, + "width": 172, + "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1490124639, + "version": 201, + "versionNonce": 1184719761, + "isDeleted": false, + "boundElements": [], + "updated": 1710176612938, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 172, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "oHO2eTWTvJ-tVFJnoN2bJ", + "focus": -0.01639344262295082, + "gap": 1 + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "id": "6VAkpFxlGmvrRIaHgXfbV", + "type": "arrow", + "x": 545, + "y": 386, + "width": 88, + "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 739439761, + "version": 45, + "versionNonce": 489474417, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612938, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 88, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "rNWzJ1bQ9jxPVRC1qcs1w", + "focus": 0, + "gap": 1 + }, + "endBinding": { + "elementId": "vjKvs7iTUn04a1o6fvV5E", + "focus": 0.028985507246376812, + "gap": 2.016107863236094 + }, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "id": "sj-m39OC5idw4UlmM4Ywy", + "type": "rectangle", + "x": 824, + "y": 156, + "width": 121.99999999999997, + "height": 288, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 2035621713, + "version": 124, + "versionNonce": 804487711, + "isDeleted": false, + 
"boundElements": [ + { + "id": "RuhZKd106tr5EVuwecksu", + "type": "arrow" + }, + { + "id": "sudHYj04fhVRVygtlmUZO", + "type": "arrow" + }, + { + "id": "SL6m5RK9xRUWfj4_UDFBm", + "type": "arrow" + } + ], + "updated": 1710176612938, + "link": null, + "locked": false + }, + { + "id": "lyjE6D0mONYuv5okp0Q3b", + "type": "text", + "x": 407, + "y": 302, + "width": 119.51986694335938, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1704234879, + "version": 32, + "versionNonce": 2073814591, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612938, + "link": null, + "locked": false, + "text": "Speed Layer", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Speed Layer", + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 109, + "versionNonce": 153870641, + "isDeleted": false, + "id": "rAucD1WceQoQ-_PYcsqjz", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 407.2400665283203, + "y": 115.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 123.33987426757812, + "height": 25, + "seed": 1351620447, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710176612938, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "Batch Layer", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Batch Layer", + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 220, + "versionNonce": 1491256927, + "isDeleted": false, + "id": "uOgkU2FqRKgqxvyLj_x9n", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 852.3300628662109, + "y": 186.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 65.89993286132812, + "height": 50, + "seed": 910941681, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710176612938, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "Serving\nLayer", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Serving\nLayer", + "lineHeight": 1.25 + }, + { + "id": "sudHYj04fhVRVygtlmUZO", + "type": "arrow", + "x": 715, + "y": 200, + "width": 105, + "height": 117, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1957875007, + "version": 68, + "versionNonce": 811242257, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612938, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 105, + 117 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "Qe-iZkdxkm5IFmst6BQQY", + "focus": -0.8107457140822503, + "gap": 2.0362277498507098 + }, + "endBinding": { + "elementId": "sj-m39OC5idw4UlmM4Ywy", + "focus": -0.42188974255290485, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "id": "SL6m5RK9xRUWfj4_UDFBm", + "type": "arrow", + "x": 718, + "y": 389.602196, + "width": 100, + 
"height": 74, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1645434175, + "version": 78, + "versionNonce": 110666367, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612938, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 100, + -74 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "vjKvs7iTUn04a1o6fvV5E", + "focus": 0.7331289930444519, + "gap": 1.1124726488678505 + }, + "endBinding": { + "elementId": "sj-m39OC5idw4UlmM4Ywy", + "focus": 0.17964367135455228, + "gap": 6 + }, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "id": "RuhZKd106tr5EVuwecksu", + "type": "arrow", + "x": 947, + "y": 318.6880364139432, + "width": 55, + "height": 1.6880364139432231, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1738381713, + "version": 146, + "versionNonce": 1184368881, + "isDeleted": false, + "boundElements": null, + "updated": 1710176612938, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 55, + -1.6880364139432231 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "sj-m39OC5idw4UlmM4Ywy", + "focus": 0.14115723095066343, + "gap": 1 + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "type": "arrow", + "version": 217, + "versionNonce": 1557001727, + "isDeleted": false, + "id": "r4LDeJQkIEfQ-uqNYv5nX", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 563.4277783017823, + "y": 390.3281457532928, + "strokeColor": "#2f9e44", + "backgroundColor": "#ffffff", + "width": 36.62343372214919, + "height": 19.720310465772638, + "seed": 226865791, + "groupIds": [ + "0XtP12tT20JbYfhhqkKxp" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710176636957, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + 11.268748837584363, + -19.720310465772638 + ], + [ + 36.62343372214919, + -8.451561628188273 + ] + ] + }, + { + "type": "arrow", + "version": 229, + "versionNonce": 1390002751, + "isDeleted": false, + "id": "tbcDfIGZMWHqUBrkswwUQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 600.4536673395595, + "y": 388.3158691751527, + "strokeColor": "#2f9e44", + "backgroundColor": "#ffffff", + "width": 33.00133588149707, + "height": 19.317855150144627, + "seed": 1548290719, + "groupIds": [ + "0XtP12tT20JbYfhhqkKxp" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710176636972, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + -16.903123256376546, + 19.317855150144627 + ], + [ + -33.00133588149707, + 6.8417403656762215 + ] + ] + }, + { + "type": "arrow", + "version": 282, + 
"versionNonce": 931028081, + "isDeleted": false, + "id": "a0XY26tTZ2eGo2mVj_vQq", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 532.0936391467952, + "y": 201.2707683548221, + "strokeColor": "#2f9e44", + "backgroundColor": "#ffffff", + "width": 36.62343372214919, + "height": 19.720310465772638, + "seed": 309929137, + "groupIds": [ + "Yi5N9VZ8-dP0Tt6o4sTvz" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710176645994, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + 11.268748837584363, + -19.720310465772638 + ], + [ + 36.62343372214919, + -8.451561628188273 + ] + ] + }, + { + "type": "arrow", + "version": 294, + "versionNonce": 1027319377, + "isDeleted": false, + "id": "LvJWMW4uClSiulZvcAiZL", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 569.1195281845725, + "y": 199.25849177668198, + "strokeColor": "#2f9e44", + "backgroundColor": "#ffffff", + "width": 33.00133588149707, + "height": 19.317855150144627, + "seed": 909338257, + "groupIds": [ + "Yi5N9VZ8-dP0Tt6o4sTvz" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710176645994, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + -16.903123256376546, + 19.317855150144627 + ], + [ + -33.00133588149707, + 6.8417403656762215 + ] + ] + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/lambda-architecture.svg b/assets/img/high-level-design/lambda-architecture.svg new file mode 100644 index 0000000..8e7dc7d --- /dev/null +++ b/assets/img/high-level-design/lambda-architecture.svg @@ -0,0 +1,21 @@ + + + + + + + + DFSMessageBrokerViewViewSpeed LayerBatch LayerServingLayer \ No newline at end of file diff --git a/assets/img/high-level-design/load-balancer-vs-api-gateway.png b/assets/img/high-level-design/load-balancer-vs-api-gateway.png new file mode 100644 index 0000000..2207ff0 Binary files /dev/null and b/assets/img/high-level-design/load-balancer-vs-api-gateway.png differ diff --git a/assets/img/high-level-design/load-balancing-microservices.excalidraw b/assets/img/high-level-design/load-balancing-microservices.excalidraw new file mode 100644 index 0000000..4443395 --- /dev/null +++ b/assets/img/high-level-design/load-balancing-microservices.excalidraw @@ -0,0 +1,1242 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "p76ZtehLXDYaYPgy57ZRG", + "type": "rectangle", + "x": 753, + "y": 407, + "width": 31, + "height": 56, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 1944850215, + "version": 56, + "versionNonce": 1731275815, + "isDeleted": false, + "boundElements": [ + { + "id": "HktgtQRc2CmCmbRJR_YjR", + "type": "arrow" + } + ], + "updated": 1710311921140, + "link": null, + "locked": false + }, + { + 
"type": "rectangle", + "version": 87, + "versionNonce": 1755778887, + "isDeleted": false, + "id": "P-fBKOaRr5fVivKcZJ6rQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 845.5, + "y": 403, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 1724863817, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710311921140, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 85, + "versionNonce": 62099047, + "isDeleted": false, + "id": "k40rGjU0Mpkw0e57GHh0B", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 798.5, + "y": 409, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 323966729, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710311921140, + "link": null, + "locked": false + }, + { + "id": "O8pnhh7uNvLF3pOKKPtgp", + "type": "rectangle", + "x": 640, + "y": 398, + "width": 52, + "height": 86, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 1105065033, + "version": 71, + "versionNonce": 1112815817, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "esdljNMYHW-InUdBno-br" + }, + { + "id": "HktgtQRc2CmCmbRJR_YjR", + "type": "arrow" + }, + { + "id": "TYrkACaEDcY-dsyLC6tDe", + "type": "arrow" + } + ], + "updated": 1710312024611, + "link": null, + "locked": false + }, + { + "id": "esdljNMYHW-InUdBno-br", + "type": "text", + "x": 654.28125, + "y": 429, + "width": 23.4375, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 785417225, + "version": 56, + "versionNonce": 607281319, + "isDeleted": false, + "boundElements": null, + "updated": 1710311921140, + "link": null, + "locked": false, + "text": "LB", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "O8pnhh7uNvLF3pOKKPtgp", + "originalText": "LB", + "lineHeight": 1.2 + }, + { + "id": "HktgtQRc2CmCmbRJR_YjR", + "type": "arrow", + "x": 692, + "y": 439.8973379328761, + "width": 45, + "height": 1.4009383710219936, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 424909543, + "version": 220, + "versionNonce": 1199658633, + "isDeleted": false, + "boundElements": null, + "updated": 1710311921186, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 45, + 1.4009383710219936 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "O8pnhh7uNvLF3pOKKPtgp", + "focus": -0.043645696365462665, + "gap": 1 + }, + "endBinding": { + "elementId": "u-4jZcFvPXUqqmKrNTIHE", + "focus": -0.12533377461999445, + "gap": 1 + }, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "id": 
"u-4jZcFvPXUqqmKrNTIHE", + "type": "rectangle", + "x": 738, + "y": 390, + "width": 156, + "height": 95.00000000000001, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 511094057, + "version": 128, + "versionNonce": 1833390567, + "isDeleted": false, + "boundElements": [ + { + "id": "HktgtQRc2CmCmbRJR_YjR", + "type": "arrow" + }, + { + "id": "3oHvvRYgMDYy5ycTn4p__", + "type": "arrow" + }, + { + "id": "SxLkOvAVf1VEl1A37dfAR", + "type": "arrow" + } + ], + "updated": 1710312017608, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 124, + "versionNonce": 19548361, + "isDeleted": false, + "id": "s6H_2RCtA8wQH_SYtt2h7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1118, + "y": 274.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 1332715337, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "BGT-pewaOiROgtyZgzfq2", + "type": "arrow" + } + ], + "updated": 1710311917454, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 163, + "versionNonce": 359684871, + "isDeleted": false, + "id": "5Xbtfnpy_hRN0gwU9gliR", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1209.5, + "y": 275.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 950980137, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710311954096, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 153, + "versionNonce": 723741321, + "isDeleted": false, + "id": "cs88_UmKvjmgToDTIAli4", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1163.5, + "y": 276.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 1407525129, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710311917454, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 139, + "versionNonce": 1443738153, + "isDeleted": false, + "id": "Ghf4YoX_OUZH-a5qsgZ3x", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1005, + "y": 265.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 52, + "height": 86, + "seed": 512628713, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "dGF-svLrXvTaXDpl-cixn" + }, + { + "id": "BGT-pewaOiROgtyZgzfq2", + "type": "arrow" + }, + { + "id": "3oHvvRYgMDYy5ycTn4p__", + "type": "arrow" + } + ], + "updated": 1710312009874, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 124, + "versionNonce": 1125063753, + "isDeleted": false, + "id": "dGF-svLrXvTaXDpl-cixn", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1019.28125, + "y": 296.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 
23.4375, + "height": 24, + "seed": 1521824457, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710311917454, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "LB", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Ghf4YoX_OUZH-a5qsgZ3x", + "originalText": "LB", + "lineHeight": 1.2 + }, + { + "type": "arrow", + "version": 490, + "versionNonce": 47149895, + "isDeleted": false, + "id": "BGT-pewaOiROgtyZgzfq2", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1058, + "y": 307.25613951408525, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 44, + "height": 1.0816515442621153, + "seed": 2096492969, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710311946648, + "link": null, + "locked": false, + "startBinding": { + "elementId": "Ghf4YoX_OUZH-a5qsgZ3x", + "gap": 1, + "focus": -0.043645696365462665 + }, + "endBinding": { + "elementId": "znbSM2RW6ERSelrgg29fT", + "gap": 1, + "focus": -0.12533377461999445 + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + 44, + 1.0816515442621153 + ] + ] + }, + { + "type": "rectangle", + "version": 230, + "versionNonce": 690407687, + "isDeleted": false, + "id": "znbSM2RW6ERSelrgg29fT", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1103, + "y": 257.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 241, + "height": 95.00000000000001, + "seed": 1518064777, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "BGT-pewaOiROgtyZgzfq2", + "type": "arrow" + } + ], + "updated": 1710311946647, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 44, + "versionNonce": 908576745, + "isDeleted": false, + "id": "XNE-jd-TqyQXuBAtxcTTs", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1146, + "y": 532.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 650977513, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "rFJ7SmnVIlx-RoVk-rDeo", + "type": "arrow" + } + ], + "updated": 1710311925003, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 59, + "versionNonce": 10003049, + "isDeleted": false, + "id": "6bnfiSOAfWWZ6llCHJWx7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1033, + "y": 523.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 52, + "height": 86, + "seed": 541818249, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "tSkkEOqSNsOKgozCICPKz" + }, + { + "id": "rFJ7SmnVIlx-RoVk-rDeo", + "type": "arrow" + }, + { + "id": "SxLkOvAVf1VEl1A37dfAR", + "type": "arrow" + } + ], + "updated": 1710312015318, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 44, + "versionNonce": 2084834153, + "isDeleted": false, + "id": "tSkkEOqSNsOKgozCICPKz", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + 
"angle": 0, + "x": 1047.28125, + "y": 554.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 23.4375, + "height": 24, + "seed": 922018921, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710311925003, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "LB", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "6bnfiSOAfWWZ6llCHJWx7", + "originalText": "LB", + "lineHeight": 1.2 + }, + { + "type": "arrow", + "version": 242, + "versionNonce": 346085767, + "isDeleted": false, + "id": "rFJ7SmnVIlx-RoVk-rDeo", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1086, + "y": 565.7834377367436, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 44, + "height": 1.9578376538195243, + "seed": 1166962505, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710311932737, + "link": null, + "locked": false, + "startBinding": { + "elementId": "6bnfiSOAfWWZ6llCHJWx7", + "gap": 1, + "focus": -0.043645696365462665 + }, + "endBinding": { + "elementId": "bH1K_t2yFo5p2eJDxyY77", + "gap": 1, + "focus": -0.12533377461999445 + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + 44, + 1.9578376538195243 + ] + ] + }, + { + "type": "rectangle", + "version": 142, + "versionNonce": 349224775, + "isDeleted": false, + "id": "bH1K_t2yFo5p2eJDxyY77", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1131, + "y": 515.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 60.00000000000001, + "height": 95.00000000000001, + "seed": 578002473, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "rFJ7SmnVIlx-RoVk-rDeo", + "type": "arrow" + } + ], + "updated": 1710311932736, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 172, + "versionNonce": 2008939081, + "isDeleted": false, + "id": "65_B2sRowMtlfx3zJYcCk", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1302.5, + "y": 274, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 136316297, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710311944482, + "link": null, + "locked": false + }, + { + "type": "rectangle", + "version": 180, + "versionNonce": 725888073, + "isDeleted": false, + "id": "fCgF8XJzRDQXqouz0zdit", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1258.5, + "y": 278, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 31, + "height": 56, + "seed": 400619337, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [], + "updated": 1710311942664, + "link": null, + "locked": false + }, + { + "id": "3oHvvRYgMDYy5ycTn4p__", + "type": "arrow", + "x": 899, + "y": 432, + "width": 102, + "height": 116, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + 
"roundness": { + "type": 2 + }, + "seed": 1398572359, + "version": 68, + "versionNonce": 344194025, + "isDeleted": false, + "boundElements": null, + "updated": 1710312009874, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 102, + -116 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "u-4jZcFvPXUqqmKrNTIHE", + "focus": 0.652630821276902, + "gap": 5 + }, + "endBinding": { + "elementId": "Ghf4YoX_OUZH-a5qsgZ3x", + "focus": 0.36679275871386113, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "type": "arrow", + "version": 140, + "versionNonce": 391397767, + "isDeleted": false, + "id": "SxLkOvAVf1VEl1A37dfAR", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 896.0505365595221, + "y": 438.34703483350563, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 133.0000000000001, + "height": 133, + "seed": 44160583, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312017789, + "link": null, + "locked": false, + "startBinding": { + "elementId": "u-4jZcFvPXUqqmKrNTIHE", + "focus": -0.6311035994104899, + "gap": 2.050536559522129 + }, + "endBinding": { + "elementId": "6bnfiSOAfWWZ6llCHJWx7", + "focus": -0.5042970764345416, + "gap": 3.949463440477757 + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "triangle", + "points": [ + [ + 0, + 0 + ], + [ + 133.0000000000001, + 133 + ] + ] + }, + { + "id": "TYrkACaEDcY-dsyLC6tDe", + "type": "arrow", + "x": 502, + "y": 439, + "width": 134, + "height": 1, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 626576327, + "version": 55, + "versionNonce": 681224489, + "isDeleted": false, + "boundElements": null, + "updated": 1710312029233, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 134, + -1 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": { + "elementId": "O8pnhh7uNvLF3pOKKPtgp", + "focus": 0.07463718037318591, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "triangle" + }, + { + "type": "ellipse", + "version": 222, + "versionNonce": 663092679, + "isDeleted": false, + "id": "RAVQDSncmSo4MNFoUMdsQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 464.1277520675212, + "y": 394.98628092367204, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 22, + "height": 23, + "seed": 261403943, + "groupIds": [ + "_78nfUriGcxIZRV7-6A56" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312037607, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 246, + "versionNonce": 363900135, + "isDeleted": false, + "id": "HwVccMHbVZRjYsYuDsPEG", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 474.1277520675212, + "y": 415.98628092367204, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 49, + "seed": 766605383, + "groupIds": [ + "_78nfUriGcxIZRV7-6A56" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312037607, 
+ "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49 + ] + ] + }, + { + "type": "line", + "version": 206, + "versionNonce": 1157134343, + "isDeleted": false, + "id": "PBZHWyM7H4JK_uYRG71Dy", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 474.1277520675212, + "y": 465.98628092367204, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15, + "height": 18, + "seed": 1837684583, + "groupIds": [ + "_78nfUriGcxIZRV7-6A56" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312037607, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -15, + 18 + ] + ] + }, + { + "type": "line", + "version": 215, + "versionNonce": 1264181031, + "isDeleted": false, + "id": "QJN97JyBH2iE1Iz56rLpX", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 475.1277520675212, + "y": 471.98628092367204, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 17, + "seed": 1969705607, + "groupIds": [ + "_78nfUriGcxIZRV7-6A56" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312037607, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 17 + ] + ] + }, + { + "type": "line", + "version": 214, + "versionNonce": 1262576199, + "isDeleted": false, + "id": "6dWePiXtj9AweSHDqr7mU", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 475.1277520675212, + "y": 436.98628092367204, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 15, + "seed": 1846110631, + "groupIds": [ + "_78nfUriGcxIZRV7-6A56" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312037607, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 16, + 15 + ] + ] + }, + { + "type": "line", + "version": 212, + "versionNonce": 1703641447, + "isDeleted": false, + "id": "0EOCyD9shjKiQ6JsmKVCQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 473.1277520675212, + "y": 436.98628092367204, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 16, + "height": 16, + "seed": 882613447, + "groupIds": [ + "_78nfUriGcxIZRV7-6A56" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710312037607, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + -16, + 16 + ] + ] + }, + { + "id": "Gc5oISZtMQl8Balg3XIze", + "type": "line", + "x": 573, + "y": 235, + "width": 1, + "height": 409, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": 
"transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 988290633, + "version": 63, + "versionNonce": 380029287, + "isDeleted": false, + "boundElements": null, + "updated": 1710312046490, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -1, + 409 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/load-balancing-microservices.png b/assets/img/high-level-design/load-balancing-microservices.png new file mode 100644 index 0000000..4dcd243 Binary files /dev/null and b/assets/img/high-level-design/load-balancing-microservices.png differ diff --git a/assets/img/high-level-design/merkle-tree.png b/assets/img/high-level-design/merkle-tree.png new file mode 100644 index 0000000..7815755 Binary files /dev/null and b/assets/img/high-level-design/merkle-tree.png differ diff --git a/assets/img/high-level-design/multi-tier-constraint.excalidraw b/assets/img/high-level-design/multi-tier-constraint.excalidraw new file mode 100644 index 0000000..d4e882b --- /dev/null +++ b/assets/img/high-level-design/multi-tier-constraint.excalidraw @@ -0,0 +1,443 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "1v0eZ3rjB5Wq4stkzHgaH", + "type": "rectangle", + "x": 333, + "y": 154, + "width": 123.00000000000001, + "height": 188, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 268692287, + "version": 34, + "versionNonce": 101966591, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "Hu4SIUpFxsXR1MiIwesBV" + }, + { + "id": "bCU7pbVUQgQ_pW3RzMwcJ", + "type": "arrow" + }, + { + "id": "h59C1k10-GIdE-Fg42l0n", + "type": "arrow" + } + ], + "updated": 1710165313869, + "link": null, + "locked": false + }, + { + "id": "Hu4SIUpFxsXR1MiIwesBV", + "type": "text", + "x": 359.34375, + "y": 236, + "width": 70.3125, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2119886303, + "version": 24, + "versionNonce": 52515807, + "isDeleted": false, + "boundElements": null, + "updated": 1710165341750, + "link": null, + "locked": false, + "text": "tier 1", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "1v0eZ3rjB5Wq4stkzHgaH", + "originalText": "tier 1", + "lineHeight": 1.2 + }, + { + "type": "rectangle", + "version": 75, + "versionNonce": 1110419217, + "isDeleted": false, + "id": "2SKasoWxTW2cEpRs-8PbY", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 574.5, + "y": 153, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 123.00000000000001, + "height": 188, + "seed": 1877396721, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + 
"boundElements": [ + { + "type": "text", + "id": "iUE6T4BwdjkFz-WKOtfMH" + }, + { + "id": "bCU7pbVUQgQ_pW3RzMwcJ", + "type": "arrow" + }, + { + "id": "KcpiOyt-TLOWM6HRm4pri", + "type": "arrow" + } + ], + "updated": 1710165296417, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 67, + "versionNonce": 1171654783, + "isDeleted": false, + "id": "iUE6T4BwdjkFz-WKOtfMH", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 600.84375, + "y": 235, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 70.3125, + "height": 24, + "seed": 541198033, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710165346222, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "tier 2", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "2SKasoWxTW2cEpRs-8PbY", + "originalText": "tier 2", + "lineHeight": 1.2 + }, + { + "type": "rectangle", + "version": 150, + "versionNonce": 726167359, + "isDeleted": false, + "id": "LVQRg06EkSBzz8Qx1MVvR", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 814.5, + "y": 153, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 123.00000000000001, + "height": 188, + "seed": 129709855, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "SgHDmMNo-NYkx7KwyrC8r" + }, + { + "id": "KcpiOyt-TLOWM6HRm4pri", + "type": "arrow" + }, + { + "id": "h59C1k10-GIdE-Fg42l0n", + "type": "arrow" + } + ], + "updated": 1710165313869, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 142, + "versionNonce": 1575344415, + "isDeleted": false, + "id": "SgHDmMNo-NYkx7KwyrC8r", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 840.84375, + "y": 235, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 70.3125, + "height": 24, + "seed": 1252326207, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1710165349783, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 3, + "text": "tier 3", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "LVQRg06EkSBzz8Qx1MVvR", + "originalText": "tier 3", + "lineHeight": 1.2 + }, + { + "id": "bCU7pbVUQgQ_pW3RzMwcJ", + "type": "arrow", + "x": 463, + "y": 244, + "width": 111, + "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 784148113, + "version": 39, + "versionNonce": 233390481, + "isDeleted": false, + "boundElements": null, + "updated": 1710165285784, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 111, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "1v0eZ3rjB5Wq4stkzHgaH", + "focus": -0.0425531914893617, + "gap": 7 + }, + "endBinding": { + "elementId": "2SKasoWxTW2cEpRs-8PbY", + "focus": 0.031914893617021274, + "gap": 1 + }, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "type": "arrow", + "version": 155, + "versionNonce": 1849825375, + "isDeleted": false, + "id": "KcpiOyt-TLOWM6HRm4pri", + "fillStyle": 
"solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 702.6937630655896, + "y": 245.69837785437568, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 108.30623693441044, + "height": 0, + "seed": 489449535, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1710165307237, + "link": null, + "locked": false, + "startBinding": { + "elementId": "2SKasoWxTW2cEpRs-8PbY", + "focus": -0.013847044102386399, + "gap": 5.193763065589565 + }, + "endBinding": { + "elementId": "LVQRg06EkSBzz8Qx1MVvR", + "focus": 0.013847044102386399, + "gap": 3.5 + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 108.30623693441044, + 0 + ] + ] + }, + { + "id": "h59C1k10-GIdE-Fg42l0n", + "type": "arrow", + "x": 418, + "y": 356, + "width": 446, + "height": 57, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1527060209, + "version": 141, + "versionNonce": 596270129, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "fGJh9EwrNEEqPPjJzGidW" + } + ], + "updated": 1710165329451, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 211, + 57 + ], + [ + 446, + 5 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "1v0eZ3rjB5Wq4stkzHgaH", + "focus": 0.9189785556674307, + "gap": 14 + }, + "endBinding": { + "elementId": "LVQRg06EkSBzz8Qx1MVvR", + "focus": -1.0347200253084465, + "gap": 20 + }, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "fGJh9EwrNEEqPPjJzGidW", + "type": "text", + "x": 618.453125, + "y": 391.4, + "width": 21.09375, + "height": 43.199999999999996, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1462920831, + "version": 5, + "versionNonce": 1942451999, + "isDeleted": false, + "boundElements": null, + "updated": 1710165328167, + "link": null, + "locked": false, + "text": "X", + "fontSize": 36, + "fontFamily": 3, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "h59C1k10-GIdE-Fg42l0n", + "originalText": "X", + "lineHeight": 1.2 + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/multi-tier-constraint.svg b/assets/img/high-level-design/multi-tier-constraint.svg new file mode 100644 index 0000000..d8a0ee4 --- /dev/null +++ b/assets/img/high-level-design/multi-tier-constraint.svg @@ -0,0 +1,21 @@ + + + + + + + + tier 1tier 2tier 3X \ No newline at end of file diff --git a/assets/img/high-level-design/percentile-distribution-response-time.excalidraw b/assets/img/high-level-design/percentile-distribution-response-time.excalidraw new file mode 100644 index 0000000..fcd9a9e --- /dev/null +++ b/assets/img/high-level-design/percentile-distribution-response-time.excalidraw @@ -0,0 +1,1194 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "JdoxsrWvNODNLpMMTC5uB", + "type": "arrow", + "x": 413, + "y": 451, + "width": 590, 
+ "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2110908610, + "version": 121, + "versionNonce": 1359093854, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123932, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 590, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "h6afKQg60_TUv760egqOh", + "type": "arrow", + "x": 409, + "y": 452, + "width": 0, + "height": 333, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2097272414, + "version": 104, + "versionNonce": 1341463682, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123932, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + -333 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow" + }, + { + "id": "eGOuo0bWAAi7fmHaAv91w", + "type": "text", + "x": 277, + "y": 258, + "width": 105.46875, + "height": 48, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 207665346, + "version": 79, + "versionNonce": 718955678, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123932, + "link": null, + "locked": false, + "text": "Response\nTime (ms)", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Response\nTime (ms)", + "lineHeight": 1.2 + }, + { + "id": "5CmJv_sYMYz7VFEorhafN", + "type": "text", + "x": 626, + "y": 484, + "width": 117.1875, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 314589598, + "version": 60, + "versionNonce": 1373781058, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123932, + "link": null, + "locked": false, + "text": "Percentile", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Percentile", + "lineHeight": 1.2 + }, + { + "id": "bdJisICrQfqp4ltUJcpyj", + "type": "line", + "x": 412, + "y": 389, + "width": 557, + "height": 0, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 829407234, + "version": 110, + "versionNonce": 1175294914, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123932, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 557, + 0 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + 
"endArrowhead": null + }, + { + "type": "line", + "version": 147, + "versionNonce": 437780830, + "isDeleted": false, + "id": "U-4Pzx2USWRWSxEtMsASo", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 413.2104943774641, + "y": 348.47409456744793, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 557, + "height": 0, + "seed": 1786988738, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965123932, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 557, + 0 + ] + ] + }, + { + "type": "line", + "version": 163, + "versionNonce": 879348610, + "isDeleted": false, + "id": "Nge0FoeBIHyrgRt4NukeD", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 415.13378942236307, + "y": 320.8426383104471, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 557, + "height": 0, + "seed": 2083702594, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965123932, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 557, + 0 + ] + ] + }, + { + "type": "line", + "version": 206, + "versionNonce": 2022450590, + "isDeleted": false, + "id": "ubDRZfoHDGNzrr-phmyle", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 407.9821135781705, + "y": 225.50453187003728, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 557, + "height": 0, + "seed": 756894466, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965123932, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 557, + 0 + ] + ] + }, + { + "type": "line", + "version": 269, + "versionNonce": 1105791810, + "isDeleted": false, + "id": "atrswTiXD-Xk7eBFvOc6L", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 411.2570978663861, + "y": 175.07110832706093, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 557, + "height": 0, + "seed": 1711478110, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1709965123932, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 557, + 0 + ] + ] + }, + { + "id": "WEBogZk998fdvopmv9F_a", + "type": "freedraw", + "x": 472, + "y": 387, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2099526494, + "version": 14, + "versionNonce": 869914142, + "isDeleted": false, + "boundElements": null, + "updated": 
1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "uCZfxwrZXApBJVoaKPZP7", + "type": "freedraw", + "x": 519, + "y": 378, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1809338398, + "version": 14, + "versionNonce": 1877779138, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "y_qBHrT9Ix9xbx6eZSGXX", + "type": "freedraw", + "x": 555, + "y": 375, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 975832862, + "version": 10, + "versionNonce": 1458234078, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "ozP1zcHn272j7jeYSy8Oj", + "type": "freedraw", + "x": 615, + "y": 350, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1602008030, + "version": 10, + "versionNonce": 198164994, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "2EMdBPfkXb5vvfMvcqF1b", + "type": "freedraw", + "x": 650, + "y": 345, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1420662942, + "version": 10, + "versionNonce": 730257182, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "B6q0en6zlWSei-K5e44Ac", + "type": "freedraw", + "x": 679, + "y": 349, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2127017310, + "version": 10, + "versionNonce": 1447834050, + "isDeleted": false, + "boundElements": null, + "updated": 
1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "yFsk1w-hcYjlvQ_Bzfn3f", + "type": "freedraw", + "x": 709, + "y": 344, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 985008670, + "version": 10, + "versionNonce": 689587038, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "zQCmdl9wD1nW6wd2BkWRJ", + "type": "freedraw", + "x": 731, + "y": 320, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1766037214, + "version": 10, + "versionNonce": 1059367298, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "tx7cv_tWhyFwuU44lx4sV", + "type": "freedraw", + "x": 772, + "y": 269, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2065390494, + "version": 10, + "versionNonce": 838215582, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "sWQ1jR8AcR0feKE7ZpGDo", + "type": "freedraw", + "x": 816, + "y": 224, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2093504606, + "version": 10, + "versionNonce": 764946754, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "hSavKw3rhZ36G4ijvAYvY", + "type": "freedraw", + "x": 800, + "y": 257, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 582323486, + "version": 10, + "versionNonce": 910918622, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, 
+ "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "FX-jyeopRjNS15fh_OoK6", + "type": "freedraw", + "x": 861, + "y": 176, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1429076446, + "version": 10, + "versionNonce": 1225109762, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "EZKyrUSQ6K59yqcB_Ccwb", + "type": "freedraw", + "x": 592, + "y": 359, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 862339906, + "version": 9, + "versionNonce": 1961640130, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "ebAQrymXDDrj_XM6AYfGt", + "type": "freedraw", + "x": 441, + "y": 396, + "width": 0.0001, + "height": 0.0001, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 4, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 880226946, + "version": 9, + "versionNonce": 1334223966, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0.0001, + 0.0001 + ] + ], + "pressures": [], + "simulatePressure": true, + "lastCommittedPoint": [ + 0.0001, + 0.0001 + ] + }, + { + "id": "TH-aJYYbte0UrvJxyaQi3", + "type": "line", + "x": 859, + "y": 181, + "width": 0, + "height": 269, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 347391646, + "version": 93, + "versionNonce": 1848535198, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 269 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "id": "sV-foxwcdA4uTTC8FQmEz", + "type": "line", + "x": 815, + "y": 226, + "width": 0, + "height": 220, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 921314434, + "version": 79, + "versionNonce": 549484610, + "isDeleted": false, + "boundElements": null, + "updated": 
1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 220 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "id": "B_HN1pOxcc5mSH71wlGf3", + "type": "line", + "x": 440, + "y": 396, + "width": 0, + "height": 55, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 947239426, + "version": 23, + "versionNonce": 601072862, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 55 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "id": "G0MdFb6wtdBQk96q83qBE", + "type": "text", + "x": 842, + "y": 465, + "width": 35.15625, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 688419550, + "version": 25, + "versionNonce": 1884037122, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "text": "100", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "100", + "lineHeight": 1.2 + }, + { + "id": "gtaBLlrgbKBhyW-Q6dyhA", + "type": "text", + "x": 428, + "y": 468, + "width": 23.4375, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1138359454, + "version": 22, + "versionNonce": 570906910, + "isDeleted": false, + "boundElements": null, + "updated": 1709965123933, + "link": null, + "locked": false, + "text": "10", + "fontSize": 20, + "fontFamily": 3, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "10", + "lineHeight": 1.2 + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/assets/img/high-level-design/percentile-distribution-response-time.svg b/assets/img/high-level-design/percentile-distribution-response-time.svg new file mode 100644 index 0000000..9378ba9 --- /dev/null +++ b/assets/img/high-level-design/percentile-distribution-response-time.svg @@ -0,0 +1,21 @@ + + + + + + + + ResponseTime (ms)Percentile10010 \ No newline at end of file diff --git a/assets/img/high-level-design/pipes-and-filters.png b/assets/img/high-level-design/pipes-and-filters.png new file mode 100644 index 0000000..aa7bcda Binary files /dev/null and b/assets/img/high-level-design/pipes-and-filters.png differ diff --git a/assets/img/high-level-design/rolling-deployment-pattern.png b/assets/img/high-level-design/rolling-deployment-pattern.png new file mode 100644 index 0000000..958bb74 Binary files /dev/null and b/assets/img/high-level-design/rolling-deployment-pattern.png differ diff --git a/assets/img/high-level-design/saga-pattern.png b/assets/img/high-level-design/saga-pattern.png new file mode 100644 index 
0000000..846f289 Binary files /dev/null and b/assets/img/high-level-design/saga-pattern.png differ diff --git a/assets/img/high-level-design/scatter-gather.png b/assets/img/high-level-design/scatter-gather.png new file mode 100644 index 0000000..33d8a66 Binary files /dev/null and b/assets/img/high-level-design/scatter-gather.png differ diff --git a/assets/img/high-level-design/sidecar-pattern.png b/assets/img/high-level-design/sidecar-pattern.png new file mode 100644 index 0000000..468816b Binary files /dev/null and b/assets/img/high-level-design/sidecar-pattern.png differ diff --git a/assets/img/high-level-design/strangler-fig-pattern.png b/assets/img/high-level-design/strangler-fig-pattern.png new file mode 100644 index 0000000..9c64da5 Binary files /dev/null and b/assets/img/high-level-design/strangler-fig-pattern.png differ diff --git a/assets/img/high-level-design/testing-pyramid.png b/assets/img/high-level-design/testing-pyramid.png new file mode 100644 index 0000000..68c548d Binary files /dev/null and b/assets/img/high-level-design/testing-pyramid.png differ diff --git a/assets/img/high-level-design/transactional-outbox-pattern.png b/assets/img/high-level-design/transactional-outbox-pattern.png new file mode 100644 index 0000000..6ec19f3 Binary files /dev/null and b/assets/img/high-level-design/transactional-outbox-pattern.png differ diff --git a/assets/img/high-level-design/two-phase-commit.png b/assets/img/high-level-design/two-phase-commit.png new file mode 100644 index 0000000..734bd88 Binary files /dev/null and b/assets/img/high-level-design/two-phase-commit.png differ diff --git a/assets/img/java-multithreading/dining-philosophers.excalidraw b/assets/img/java-multithreading/dining-philosophers.excalidraw new file mode 100644 index 0000000..3565666 --- /dev/null +++ b/assets/img/java-multithreading/dining-philosophers.excalidraw @@ -0,0 +1,841 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "cs_6DTHMK2SxPV2dYmkQk", + "type": "ellipse", + "x": 438, + "y": 163, + "width": 250.00000000000006, + "height": 221, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 364070820, + "version": 103, + "versionNonce": 1264349468, + "isDeleted": false, + "boundElements": null, + "updated": 1716817569578, + "link": null, + "locked": false + }, + { + "id": "qF8VJ2g_CqL0iqN0L7v5L", + "type": "line", + "x": 551.7521228790283, + "y": 132, + "width": 0, + "height": 61, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 2073739684, + "version": 13, + "versionNonce": 406631964, + "isDeleted": false, + "boundElements": null, + "updated": 1716817565254, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 61 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": null + }, + { + "type": "line", + "version": 61, + "versionNonce": 530991524, + "isDeleted": false, + "id": "gwkH5gmGn1XOQmFpwAx0G", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "angle": 0, + "x": 673.903362520847, + "y": 192.34473739972802, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 61, + "seed": 107342756, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817565254, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 61 + ] + ] + }, + { + "type": "line", + "version": 110, + "versionNonce": 1274516124, + "isDeleted": false, + "id": "XPnQnoD4JkgXJCHuwhcko", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 450.6288769917328, + "y": 197.37773350030182, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 61, + "seed": 1924300580, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817565254, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 61 + ] + ] + }, + { + "type": "line", + "version": 141, + "versionNonce": 2032041372, + "isDeleted": false, + "id": "rytlpTdhVCMCyGDp28sd_", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 489.4416297933801, + "y": 332.7607096391032, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 61, + "seed": 1602631324, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817577656, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 61 + ] + ] + }, + { + "type": "line", + "version": 124, + "versionNonce": 1978747164, + "isDeleted": false, + "id": "ixWoOXtPVqrfSfEb7d7yg", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 637.4416297933801, + "y": 337.7607096391032, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 0, + "height": 61, + "seed": 249087644, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817580784, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 61 + ] + ] + }, + { + "id": "Zz6A-J8y6DmDt7CdQSORp", + "type": "line", + "x": 494.7521228790283, + "y": 202, + "width": 14, + "height": 15, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "seed": 1140985252, + "version": 17, + "versionNonce": 274266652, + "isDeleted": false, + "boundElements": null, + "updated": 1716817590977, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 14, + 15 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": 
null + }, + { + "type": "line", + "version": 37, + "versionNonce": 2116062116, + "isDeleted": false, + "id": "wTE7E7g2HHr46IkZFUj0D", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 464.75946274660646, + "y": 292.43953734090434, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 14, + "height": 15, + "seed": 1209004324, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817594992, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 14, + 15 + ] + ] + }, + { + "type": "line", + "version": 37, + "versionNonce": 31456548, + "isDeleted": false, + "id": "QFr3bM9_OwAtoUAfm8_MD", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 552.7594627466065, + "y": 342.43953734090434, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 14, + "height": 15, + "seed": 614789796, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817599549, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 14, + 15 + ] + ] + }, + { + "type": "line", + "version": 38, + "versionNonce": 1677068836, + "isDeleted": false, + "id": "X1UBhpsPD44yq-Vp2o2rX", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 649.7594627466065, + "y": 294.43953734090434, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 14, + "height": 15, + "seed": 425067556, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817602862, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 14, + 15 + ] + ] + }, + { + "type": "line", + "version": 39, + "versionNonce": 1641403044, + "isDeleted": false, + "id": "4N37RSghCsBE5nhXrzQXV", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 610.7594627466065, + "y": 200.43953734090434, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 14, + "height": 15, + "seed": 218943780, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1716817605875, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 14, + 15 + ] + ] + }, + { + "id": "AEX6hXemF7erkvNXGHutf", + "type": "text", + "x": 569.7521228790283, + "y": 130, + "width": 13.759994506835938, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 21446940, + "version": 4, + "versionNonce": 116080932, + "isDeleted": false, + "boundElements": null, + "updated": 
1716817989723, + "link": null, + "locked": false, + "text": "0", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "0", + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 28, + "versionNonce": 1229967644, + "isDeleted": false, + "id": "fsRolS0BSIcJub_RKT5MJ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 695.8721256256104, + "y": 206.5, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 5.4199981689453125, + "height": 25, + "seed": 1139516324, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1716817997599, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "lineHeight": 1.25, + "baseline": 18 + }, + { + "id": "Nunk_80XlArOBzH7rh1H8", + "type": "text", + "x": 659.7521228790283, + "y": 371, + "width": 14.239990234375, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1651194396, + "version": 2, + "versionNonce": 990490916, + "isDeleted": false, + "boundElements": null, + "updated": 1716818001077, + "link": null, + "locked": false, + "text": "2", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "2", + "lineHeight": 1.25 + }, + { + "id": "1sToIicA2pjht-yDSUVhj", + "type": "text", + "x": 507.7521228790283, + "y": 391, + "width": 13.619979858398438, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 849524772, + "version": 6, + "versionNonce": 571345060, + "isDeleted": false, + "boundElements": null, + "updated": 1716818005655, + "link": null, + "locked": false, + "text": "3", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "3", + "lineHeight": 1.25 + }, + { + "id": "I7YunP2KKyOLj2ATGH3yV", + "type": "text", + "x": 423.7521228790283, + "y": 211, + "width": 12.79998779296875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 2132225956, + "version": 2, + "versionNonce": 620501276, + "isDeleted": false, + "boundElements": null, + "updated": 1716818010125, + "link": null, + "locked": false, + "text": "4", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "4", + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 30, + "versionNonce": 23309980, + "isDeleted": false, + "id": "PLt6Rdbq4tY50oVuXlUCL", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 596.352128982544, + "y": 209.5, + "strokeColor": "#e03131", + 
"backgroundColor": "transparent", + "width": 13.759994506835938, + "height": 25, + "seed": 174513820, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1716818080878, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "0", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "0", + "lineHeight": 1.25, + "baseline": 18 + }, + { + "id": "pErl-vBtaHgFyQq0lAHak", + "type": "text", + "x": 634.7521228790283, + "y": 275, + "width": 5.4199981689453125, + "height": 25, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 324771748, + "version": 38, + "versionNonce": 658121884, + "isDeleted": false, + "boundElements": null, + "updated": 1716818088157, + "link": null, + "locked": false, + "text": "1", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "1", + "lineHeight": 1.25 + }, + { + "id": "qobnBa5o3iP8u_pzL3JSe", + "type": "text", + "x": 565.7521228790283, + "y": 322, + "width": 14.239990234375, + "height": 25, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1735004700, + "version": 40, + "versionNonce": 1332245156, + "isDeleted": false, + "boundElements": null, + "updated": 1716818069784, + "link": null, + "locked": false, + "text": "2", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "2", + "lineHeight": 1.25 + }, + { + "id": "ZLqW1_TQJGkK5faghZMhc", + "type": "text", + "x": 474.7521228790283, + "y": 271, + "width": 13.619979858398438, + "height": 25, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1321099300, + "version": 18, + "versionNonce": 10652836, + "isDeleted": false, + "boundElements": null, + "updated": 1716818090639, + "link": null, + "locked": false, + "text": "3", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "3", + "lineHeight": 1.25 + }, + { + "id": "21nPurypSEJ6jrU7oIjsK", + "type": "text", + "x": 517.7521228790283, + "y": 209, + "width": 12.79998779296875, + "height": 25, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 1373638180, + "version": 27, + "versionNonce": 359355676, + "isDeleted": false, + "boundElements": null, + "updated": 1716818096858, + "link": null, + "locked": false, + "text": "4", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "baseline": 18, + "containerId": null, + "originalText": "4", + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": null, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end 
of file diff --git a/assets/img/java-multithreading/dining-philosophers.png b/assets/img/java-multithreading/dining-philosophers.png new file mode 100644 index 0000000..a2a0960 Binary files /dev/null and b/assets/img/java-multithreading/dining-philosophers.png differ diff --git a/assets/img/java/generics-typecasting.png b/assets/img/java/generics-typecasting.png new file mode 100644 index 0000000..24c647c Binary files /dev/null and b/assets/img/java/generics-typecasting.png differ diff --git a/assets/img/java/inheritance.drawio b/assets/img/java/inheritance.drawio new file mode 100644 index 0000000..c3ae610 --- /dev/null +++ b/assets/img/java/inheritance.drawio @@ -0,0 +1,115 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/java/inheritance.drawio.png b/assets/img/java/inheritance.drawio.png new file mode 100644 index 0000000..0bdcf55 Binary files /dev/null and b/assets/img/java/inheritance.drawio.png differ diff --git a/assets/img/java/io-bound-architecture.drawio b/assets/img/java/io-bound-architecture.drawio new file mode 100644 index 0000000..35fc0b1 --- /dev/null +++ b/assets/img/java/io-bound-architecture.drawio @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/java/io-bound-architecture.drawio.png b/assets/img/java/io-bound-architecture.drawio.png new file mode 100644 index 0000000..05c9a47 Binary files /dev/null and b/assets/img/java/io-bound-architecture.drawio.png differ diff --git a/assets/img/java/java-main-output.png b/assets/img/java/java-main-output.png new file mode 100644 index 0000000..af0a54f Binary files /dev/null and b/assets/img/java/java-main-output.png differ diff --git a/assets/img/java/multithreading.drawio b/assets/img/java/multithreading.drawio new file mode 100644 index 0000000..016dcb6 --- /dev/null +++ b/assets/img/java/multithreading.drawio @@ -0,0 +1 @@ +zZhdT8IwFIZ/zS5NWLsNuISBeqFG5cLoXbNVtqRbsRQ3/PUW1+6DQoJR2Lkh7duuH0/7nh3m4DArbwRZJfc8psxBg7h08MxBaBSM1O9O2FaC7w4qYSnSuJLcRlikX1SLptsmjem601FyzmS66ooRz3MayY5GhOBFt9s7Z91ZV2RJLWEREWarL2ksE70tNGz0W5ouEzOzG4yrloyYznon64TEvGhJeO7gUHAuq1JWhpTt2Bku1XPXR1rrhQmay1Me+MwnaPb0chNPP4LXN6+cPbzmV3qUT8I2esN6sXJrCAi+yWO6G2Tg4GmRpJIuViTatRbqyJWWyIypmquK7yljIWdcqHrOc9VpqmegQtLy6NLdGoi6SJRnVIqt6mIeMAz1JfJ0tWhOxPO1lrROA421SPQtWNZDN6BUQbP6BTdkcQu5oEpx/8bvP2DhLizfhuUODsAanouVZ7G6g0fJHZ6ICZ8Lk4l4bU5/dOMZONWBucWp1i7DyY5ZADhhHxwnO0bd4d45WffpQHjy8EU5YYvT8+QeHqhx76ACC9Sj4BFdr9XrHhquut5bPEf2vQKTI9QZFJQcAYFMEvYx9Z8kIB+wBy1c/XvQDllgPLifMfTvwSFED+5jAuDBEWAPWrj69+AYrge9AJgHzSnA8uA+pv49iO0/gXA8aOE6owdVtfmE+NPW+g6L598= \ No newline at end of file diff --git a/assets/img/java/multithreading.drawio.png b/assets/img/java/multithreading.drawio.png new file mode 100644 index 0000000..c2a0487 Binary files /dev/null and b/assets/img/java/multithreading.drawio.png differ diff --git a/assets/img/java/phases-and-goals.png b/assets/img/java/phases-and-goals.png new file mode 100644 index 0000000..e8025bd Binary files /dev/null and b/assets/img/java/phases-and-goals.png differ diff --git a/assets/img/java/protected-caveat.png b/assets/img/java/protected-caveat.png new file mode 100644 index 0000000..71cc04c Binary files /dev/null and b/assets/img/java/protected-caveat.png differ diff --git 
a/assets/img/low-level-design/activity.drawio b/assets/img/low-level-design/activity.drawio new file mode 100644 index 0000000..dc39d22 --- /dev/null +++ b/assets/img/low-level-design/activity.drawio @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/low-level-design/activity.drawio.png b/assets/img/low-level-design/activity.drawio.png new file mode 100644 index 0000000..8fcbb30 Binary files /dev/null and b/assets/img/low-level-design/activity.drawio.png differ diff --git a/assets/img/low-level-design/interpreter-ast.drawio b/assets/img/low-level-design/interpreter-ast.drawio new file mode 100644 index 0000000..47381e7 --- /dev/null +++ b/assets/img/low-level-design/interpreter-ast.drawio @@ -0,0 +1,112 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/low-level-design/interpreter-ast.drawio.png b/assets/img/low-level-design/interpreter-ast.drawio.png new file mode 100644 index 0000000..31603c3 Binary files /dev/null and b/assets/img/low-level-design/interpreter-ast.drawio.png differ diff --git a/assets/img/low-level-design/use-case.drawio b/assets/img/low-level-design/use-case.drawio new file mode 100644 index 0000000..59d7f85 --- /dev/null +++ b/assets/img/low-level-design/use-case.drawio @@ -0,0 +1,79 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/low-level-design/use-case.drawio.png b/assets/img/low-level-design/use-case.drawio.png new file mode 100644 index 0000000..2579079 Binary files /dev/null and b/assets/img/low-level-design/use-case.drawio.png differ diff --git a/assets/img/messaging-systems/dead-letter-exchange.png b/assets/img/messaging-systems/dead-letter-exchange.png new file mode 100644 index 0000000..34771a1 Binary files /dev/null and b/assets/img/messaging-systems/dead-letter-exchange.png differ diff --git a/assets/img/messaging-systems/queue-processor-design.png b/assets/img/messaging-systems/queue-processor-design.png new file mode 100644 index 0000000..66a67ba Binary files /dev/null and b/assets/img/messaging-systems/queue-processor-design.png differ diff --git a/assets/img/messaging-systems/queue-router-design.png b/assets/img/messaging-systems/queue-router-design.png new file mode 100644 index 0000000..6825144 Binary files /dev/null and b/assets/img/messaging-systems/queue-router-design.png differ diff --git a/assets/img/python-basics/decorators.png b/assets/img/python-basics/decorators.png new file mode 100644 index 0000000..67ad99b Binary files /dev/null and b/assets/img/python-basics/decorators.png differ diff --git a/assets/img/python-basics/generators-iterators.png b/assets/img/python-basics/generators-iterators.png new file mode 100644 index 0000000..f96091d Binary files /dev/null and b/assets/img/python-basics/generators-iterators.png differ diff --git a/assets/img/python-data-analysis/bar-graph-with-custom-bottom.png b/assets/img/python-data-analysis/bar-graph-with-custom-bottom.png new file mode 100644 index 0000000..33a31fe Binary files /dev/null and b/assets/img/python-data-analysis/bar-graph-with-custom-bottom.png differ diff --git a/assets/img/python-data-analysis/bedroom_sort_index_example.png 
b/assets/img/python-data-analysis/bedroom_sort_index_example.png new file mode 100644 index 0000000..0e35266 Binary files /dev/null and b/assets/img/python-data-analysis/bedroom_sort_index_example.png differ diff --git a/assets/img/python-data-analysis/blinding_lights_chart_performance.png b/assets/img/python-data-analysis/blinding_lights_chart_performance.png new file mode 100644 index 0000000..0a661af Binary files /dev/null and b/assets/img/python-data-analysis/blinding_lights_chart_performance.png differ diff --git a/assets/img/python-data-analysis/christmas-songs-project.png b/assets/img/python-data-analysis/christmas-songs-project.png new file mode 100644 index 0000000..4a306b5 Binary files /dev/null and b/assets/img/python-data-analysis/christmas-songs-project.png differ diff --git a/assets/img/python-data-analysis/configured-box-plot.png b/assets/img/python-data-analysis/configured-box-plot.png new file mode 100644 index 0000000..8c79f45 Binary files /dev/null and b/assets/img/python-data-analysis/configured-box-plot.png differ diff --git a/assets/img/python-data-analysis/custom-subplots-using-pandas.png b/assets/img/python-data-analysis/custom-subplots-using-pandas.png new file mode 100644 index 0000000..a57c169 Binary files /dev/null and b/assets/img/python-data-analysis/custom-subplots-using-pandas.png differ diff --git a/assets/img/python-data-analysis/custom-tick-limits.png b/assets/img/python-data-analysis/custom-tick-limits.png new file mode 100644 index 0000000..fc910f5 Binary files /dev/null and b/assets/img/python-data-analysis/custom-tick-limits.png differ diff --git a/assets/img/python-data-analysis/custom-ticks.png b/assets/img/python-data-analysis/custom-ticks.png new file mode 100644 index 0000000..3a3f632 Binary files /dev/null and b/assets/img/python-data-analysis/custom-ticks.png differ diff --git a/assets/img/python-data-analysis/dataframe-default-without-subplots.png b/assets/img/python-data-analysis/dataframe-default-without-subplots.png new file mode 100644 index 0000000..a847b79 Binary files /dev/null and b/assets/img/python-data-analysis/dataframe-default-without-subplots.png differ diff --git a/assets/img/python-data-analysis/dataframe-with-subplots-configured.png b/assets/img/python-data-analysis/dataframe-with-subplots-configured.png new file mode 100644 index 0000000..8962d43 Binary files /dev/null and b/assets/img/python-data-analysis/dataframe-with-subplots-configured.png differ diff --git a/assets/img/python-data-analysis/dataframe-with-subplots.png b/assets/img/python-data-analysis/dataframe-with-subplots.png new file mode 100644 index 0000000..8eb2f8e Binary files /dev/null and b/assets/img/python-data-analysis/dataframe-with-subplots.png differ diff --git a/assets/img/python-data-analysis/default-box-plot.png b/assets/img/python-data-analysis/default-box-plot.png new file mode 100644 index 0000000..6ac876b Binary files /dev/null and b/assets/img/python-data-analysis/default-box-plot.png differ diff --git a/assets/img/python-data-analysis/default-histogram-without-binning.png b/assets/img/python-data-analysis/default-histogram-without-binning.png new file mode 100644 index 0000000..4fcdf32 Binary files /dev/null and b/assets/img/python-data-analysis/default-histogram-without-binning.png differ diff --git a/assets/img/python-data-analysis/default-without-ticks.png b/assets/img/python-data-analysis/default-without-ticks.png new file mode 100644 index 0000000..bed39d0 Binary files /dev/null and b/assets/img/python-data-analysis/default-without-ticks.png 
differ diff --git a/assets/img/python-data-analysis/different-figure-i.png b/assets/img/python-data-analysis/different-figure-i.png new file mode 100644 index 0000000..0b5962f Binary files /dev/null and b/assets/img/python-data-analysis/different-figure-i.png differ diff --git a/assets/img/python-data-analysis/different-figure-ii.png b/assets/img/python-data-analysis/different-figure-ii.png new file mode 100644 index 0000000..fc205da Binary files /dev/null and b/assets/img/python-data-analysis/different-figure-ii.png differ diff --git a/assets/img/python-data-analysis/different-figure-iii.png b/assets/img/python-data-analysis/different-figure-iii.png new file mode 100644 index 0000000..abae785 Binary files /dev/null and b/assets/img/python-data-analysis/different-figure-iii.png differ diff --git a/assets/img/python-data-analysis/final-matplotlib-example.png b/assets/img/python-data-analysis/final-matplotlib-example.png new file mode 100644 index 0000000..c86472c Binary files /dev/null and b/assets/img/python-data-analysis/final-matplotlib-example.png differ diff --git a/assets/img/python-data-analysis/grouping-and-aggregations-project.png b/assets/img/python-data-analysis/grouping-and-aggregations-project.png new file mode 100644 index 0000000..593a4db Binary files /dev/null and b/assets/img/python-data-analysis/grouping-and-aggregations-project.png differ diff --git a/assets/img/python-data-analysis/histogram-basic-project.png b/assets/img/python-data-analysis/histogram-basic-project.png new file mode 100644 index 0000000..6b431c5 Binary files /dev/null and b/assets/img/python-data-analysis/histogram-basic-project.png differ diff --git a/assets/img/python-data-analysis/histogram-using-different-axes.png b/assets/img/python-data-analysis/histogram-using-different-axes.png new file mode 100644 index 0000000..fa6ea00 Binary files /dev/null and b/assets/img/python-data-analysis/histogram-using-different-axes.png differ diff --git a/assets/img/python-data-analysis/histogram-using-same-axes.png b/assets/img/python-data-analysis/histogram-using-same-axes.png new file mode 100644 index 0000000..a847b79 Binary files /dev/null and b/assets/img/python-data-analysis/histogram-using-same-axes.png differ diff --git a/assets/img/python-data-analysis/histogram-with-custom-binning.png b/assets/img/python-data-analysis/histogram-with-custom-binning.png new file mode 100644 index 0000000..3ee444c Binary files /dev/null and b/assets/img/python-data-analysis/histogram-with-custom-binning.png differ diff --git a/assets/img/python-data-analysis/houses-bedrooms-vs-bathrooms-scatter-plot.png b/assets/img/python-data-analysis/houses-bedrooms-vs-bathrooms-scatter-plot.png new file mode 100644 index 0000000..f3528b6 Binary files /dev/null and b/assets/img/python-data-analysis/houses-bedrooms-vs-bathrooms-scatter-plot.png differ diff --git a/assets/img/python-data-analysis/houses-sold-by-week-datetime-line.png b/assets/img/python-data-analysis/houses-sold-by-week-datetime-line.png new file mode 100644 index 0000000..18fc1fd Binary files /dev/null and b/assets/img/python-data-analysis/houses-sold-by-week-datetime-line.png differ diff --git a/assets/img/python-data-analysis/houses-sold-in-period-monthwise.png b/assets/img/python-data-analysis/houses-sold-in-period-monthwise.png new file mode 100644 index 0000000..30b1c8f Binary files /dev/null and b/assets/img/python-data-analysis/houses-sold-in-period-monthwise.png differ diff --git a/assets/img/python-data-analysis/matplotlib-basics-specifying-both-axes.png 
b/assets/img/python-data-analysis/matplotlib-basics-specifying-both-axes.png new file mode 100644 index 0000000..cfc5e9a Binary files /dev/null and b/assets/img/python-data-analysis/matplotlib-basics-specifying-both-axes.png differ diff --git a/assets/img/python-data-analysis/matplotlib-very-basic.png b/assets/img/python-data-analysis/matplotlib-very-basic.png new file mode 100644 index 0000000..2a0dfeb Binary files /dev/null and b/assets/img/python-data-analysis/matplotlib-very-basic.png differ diff --git a/assets/img/python-data-analysis/multiple-bar-graph-alignment-side-by-side.png b/assets/img/python-data-analysis/multiple-bar-graph-alignment-side-by-side.png new file mode 100644 index 0000000..a6e688c Binary files /dev/null and b/assets/img/python-data-analysis/multiple-bar-graph-alignment-side-by-side.png differ diff --git a/assets/img/python-data-analysis/pandas-dataframe-example.png b/assets/img/python-data-analysis/pandas-dataframe-example.png new file mode 100644 index 0000000..01e5922 Binary files /dev/null and b/assets/img/python-data-analysis/pandas-dataframe-example.png differ diff --git a/assets/img/python-data-analysis/pandas-salaries-default-side-by-side.png b/assets/img/python-data-analysis/pandas-salaries-default-side-by-side.png new file mode 100644 index 0000000..fd8141d Binary files /dev/null and b/assets/img/python-data-analysis/pandas-salaries-default-side-by-side.png differ diff --git a/assets/img/python-data-analysis/pandas-salaries-stacked.png b/assets/img/python-data-analysis/pandas-salaries-stacked.png new file mode 100644 index 0000000..302bbb9 Binary files /dev/null and b/assets/img/python-data-analysis/pandas-salaries-stacked.png differ diff --git a/assets/img/python-data-analysis/pandas-series-example.png b/assets/img/python-data-analysis/pandas-series-example.png new file mode 100644 index 0000000..51f3269 Binary files /dev/null and b/assets/img/python-data-analysis/pandas-series-example.png differ diff --git a/assets/img/python-data-analysis/pie-chart-example.png b/assets/img/python-data-analysis/pie-chart-example.png new file mode 100644 index 0000000..865e17c Binary files /dev/null and b/assets/img/python-data-analysis/pie-chart-example.png differ diff --git a/assets/img/python-data-analysis/plot-line-styles.png b/assets/img/python-data-analysis/plot-line-styles.png new file mode 100644 index 0000000..40e33cf Binary files /dev/null and b/assets/img/python-data-analysis/plot-line-styles.png differ diff --git a/assets/img/python-data-analysis/plot-with-markers.png b/assets/img/python-data-analysis/plot-with-markers.png new file mode 100644 index 0000000..f04370a Binary files /dev/null and b/assets/img/python-data-analysis/plot-with-markers.png differ diff --git a/assets/img/python-data-analysis/plot-with-title.png b/assets/img/python-data-analysis/plot-with-title.png new file mode 100644 index 0000000..a2a4c99 Binary files /dev/null and b/assets/img/python-data-analysis/plot-with-title.png differ diff --git a/assets/img/python-data-analysis/plotting-bar-basics.png b/assets/img/python-data-analysis/plotting-bar-basics.png new file mode 100644 index 0000000..1898093 Binary files /dev/null and b/assets/img/python-data-analysis/plotting-bar-basics.png differ diff --git a/assets/img/python-data-analysis/plotting-basics-bedroom-value-counts.png b/assets/img/python-data-analysis/plotting-basics-bedroom-value-counts.png new file mode 100644 index 0000000..8c981a7 Binary files /dev/null and b/assets/img/python-data-analysis/plotting-basics-bedroom-value-counts.png 
differ diff --git a/assets/img/python-data-analysis/plotting-hierarchical-index-with-custom-unstack.png b/assets/img/python-data-analysis/plotting-hierarchical-index-with-custom-unstack.png new file mode 100644 index 0000000..226464b Binary files /dev/null and b/assets/img/python-data-analysis/plotting-hierarchical-index-with-custom-unstack.png differ diff --git a/assets/img/python-data-analysis/plotting-hierarchical-index-with-unstack.png b/assets/img/python-data-analysis/plotting-hierarchical-index-with-unstack.png new file mode 100644 index 0000000..4d82cf0 Binary files /dev/null and b/assets/img/python-data-analysis/plotting-hierarchical-index-with-unstack.png differ diff --git a/assets/img/python-data-analysis/plotting-hierarchical-index-without-unstack.png b/assets/img/python-data-analysis/plotting-hierarchical-index-without-unstack.png new file mode 100644 index 0000000..a107765 Binary files /dev/null and b/assets/img/python-data-analysis/plotting-hierarchical-index-without-unstack.png differ diff --git a/assets/img/python-data-analysis/same-figure-same-axes-with-legend.png b/assets/img/python-data-analysis/same-figure-same-axes-with-legend.png new file mode 100644 index 0000000..c2275f2 Binary files /dev/null and b/assets/img/python-data-analysis/same-figure-same-axes-with-legend.png differ diff --git a/assets/img/python-data-analysis/same-figure-same-axes.png b/assets/img/python-data-analysis/same-figure-same-axes.png new file mode 100644 index 0000000..244bfa3 Binary files /dev/null and b/assets/img/python-data-analysis/same-figure-same-axes.png differ diff --git a/assets/img/python-data-analysis/scatter-plot-bedrooms-vs-bathrooms.png b/assets/img/python-data-analysis/scatter-plot-bedrooms-vs-bathrooms.png new file mode 100644 index 0000000..454b7db Binary files /dev/null and b/assets/img/python-data-analysis/scatter-plot-bedrooms-vs-bathrooms.png differ diff --git a/assets/img/python-data-analysis/seaborn-barplot-simulation-using-matplotlib.png b/assets/img/python-data-analysis/seaborn-barplot-simulation-using-matplotlib.png new file mode 100644 index 0000000..de84026 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-barplot-simulation-using-matplotlib.png differ diff --git a/assets/img/python-data-analysis/seaborn-barplot.png b/assets/img/python-data-analysis/seaborn-barplot.png new file mode 100644 index 0000000..7dfef31 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-barplot.png differ diff --git a/assets/img/python-data-analysis/seaborn-bivariate-histogram-plot.png b/assets/img/python-data-analysis/seaborn-bivariate-histogram-plot.png new file mode 100644 index 0000000..4228c3f Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-bivariate-histogram-plot.png differ diff --git a/assets/img/python-data-analysis/seaborn-bivariate-kde-plot.png b/assets/img/python-data-analysis/seaborn-bivariate-kde-plot.png new file mode 100644 index 0000000..ca0e476 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-bivariate-kde-plot.png differ diff --git a/assets/img/python-data-analysis/seaborn-boxplot-and-swarmplot.png b/assets/img/python-data-analysis/seaborn-boxplot-and-swarmplot.png new file mode 100644 index 0000000..da1a8d6 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-boxplot-and-swarmplot.png differ diff --git a/assets/img/python-data-analysis/seaborn-boxplot-for-catgeories.png b/assets/img/python-data-analysis/seaborn-boxplot-for-catgeories.png new file mode 100644 index 0000000..d7200ef 
Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-boxplot-for-catgeories.png differ diff --git a/assets/img/python-data-analysis/seaborn-boxplot.png b/assets/img/python-data-analysis/seaborn-boxplot.png new file mode 100644 index 0000000..1b963b1 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-boxplot.png differ diff --git a/assets/img/python-data-analysis/seaborn-countplot-introduction.png b/assets/img/python-data-analysis/seaborn-countplot-introduction.png new file mode 100644 index 0000000..f07caf0 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-countplot-introduction.png differ diff --git a/assets/img/python-data-analysis/seaborn-countplot-simulation-using-matplotlib.png b/assets/img/python-data-analysis/seaborn-countplot-simulation-using-matplotlib.png new file mode 100644 index 0000000..9874dc2 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-countplot-simulation-using-matplotlib.png differ diff --git a/assets/img/python-data-analysis/seaborn-displot-example.png b/assets/img/python-data-analysis/seaborn-displot-example.png new file mode 100644 index 0000000..fee11de Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-displot-example.png differ diff --git a/assets/img/python-data-analysis/seaborn-getting-started.png b/assets/img/python-data-analysis/seaborn-getting-started.png new file mode 100644 index 0000000..59a3aef Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-getting-started.png differ diff --git a/assets/img/python-data-analysis/seaborn-histogram-introduction.png b/assets/img/python-data-analysis/seaborn-histogram-introduction.png new file mode 100644 index 0000000..3ebf5d4 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-histogram-introduction.png differ diff --git a/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple-dodge.png b/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple-dodge.png new file mode 100644 index 0000000..b265328 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple-dodge.png differ diff --git a/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple.png b/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple.png new file mode 100644 index 0000000..a886ab6 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-histogram-with-hue-and-multiple.png differ diff --git a/assets/img/python-data-analysis/seaborn-histogram-with-hue.png b/assets/img/python-data-analysis/seaborn-histogram-with-hue.png new file mode 100644 index 0000000..c8676fc Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-histogram-with-hue.png differ diff --git a/assets/img/python-data-analysis/seaborn-histogram-with-kde.png b/assets/img/python-data-analysis/seaborn-histogram-with-kde.png new file mode 100644 index 0000000..72cf23f Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-histogram-with-kde.png differ diff --git a/assets/img/python-data-analysis/seaborn-kde-introduction.png b/assets/img/python-data-analysis/seaborn-kde-introduction.png new file mode 100644 index 0000000..bcfb250 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-kde-introduction.png differ diff --git a/assets/img/python-data-analysis/seaborn-kde-with-bandwidth-adjustment.png b/assets/img/python-data-analysis/seaborn-kde-with-bandwidth-adjustment.png new file mode 100644 index 0000000..56d567f 
Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-kde-with-bandwidth-adjustment.png differ diff --git a/assets/img/python-data-analysis/seaborn-line-plot-default-with-estimator.png b/assets/img/python-data-analysis/seaborn-line-plot-default-with-estimator.png new file mode 100644 index 0000000..ff106e0 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-line-plot-default-with-estimator.png differ diff --git a/assets/img/python-data-analysis/seaborn-line-plot-default.png b/assets/img/python-data-analysis/seaborn-line-plot-default.png new file mode 100644 index 0000000..1192db0 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-line-plot-default.png differ diff --git a/assets/img/python-data-analysis/seaborn-line-plot-matplotlib-equivalent.png b/assets/img/python-data-analysis/seaborn-line-plot-matplotlib-equivalent.png new file mode 100644 index 0000000..8ed4618 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-line-plot-matplotlib-equivalent.png differ diff --git a/assets/img/python-data-analysis/seaborn-relplot-introduction.png b/assets/img/python-data-analysis/seaborn-relplot-introduction.png new file mode 100644 index 0000000..2b542e9 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-relplot-introduction.png differ diff --git a/assets/img/python-data-analysis/seaborn-relplot-involved-example.png b/assets/img/python-data-analysis/seaborn-relplot-involved-example.png new file mode 100644 index 0000000..0ecaa20 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-relplot-involved-example.png differ diff --git a/assets/img/python-data-analysis/seaborn-rugplot-basics.png b/assets/img/python-data-analysis/seaborn-rugplot-basics.png new file mode 100644 index 0000000..5deeb5c Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-rugplot-basics.png differ diff --git a/assets/img/python-data-analysis/seaborn-rugplot-supplementing-scatterplot.png b/assets/img/python-data-analysis/seaborn-rugplot-supplementing-scatterplot.png new file mode 100644 index 0000000..e85e049 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-rugplot-supplementing-scatterplot.png differ diff --git a/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue-and-style.png b/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue-and-style.png new file mode 100644 index 0000000..d69e367 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue-and-style.png differ diff --git a/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue.png b/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue.png new file mode 100644 index 0000000..e3b41f0 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-scatter-plot-with-hue.png differ diff --git a/assets/img/python-data-analysis/seaborn-scatter-plot-with-same-column-for-hue-and-style.png b/assets/img/python-data-analysis/seaborn-scatter-plot-with-same-column-for-hue-and-style.png new file mode 100644 index 0000000..706162b Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-scatter-plot-with-same-column-for-hue-and-style.png differ diff --git a/assets/img/python-data-analysis/seaborn-scatter-plot-with-size.png b/assets/img/python-data-analysis/seaborn-scatter-plot-with-size.png new file mode 100644 index 0000000..792181c Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-scatter-plot-with-size.png differ diff --git 
a/assets/img/python-data-analysis/seaborn-scatterplot-for-categorical-data.png b/assets/img/python-data-analysis/seaborn-scatterplot-for-categorical-data.png new file mode 100644 index 0000000..c0b739d Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-scatterplot-for-categorical-data.png differ diff --git a/assets/img/python-data-analysis/seaborn-stripplot-example.png b/assets/img/python-data-analysis/seaborn-stripplot-example.png new file mode 100644 index 0000000..8508c47 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-stripplot-example.png differ diff --git a/assets/img/python-data-analysis/seaborn-swarmplot-example.png b/assets/img/python-data-analysis/seaborn-swarmplot-example.png new file mode 100644 index 0000000..95adbdc Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-swarmplot-example.png differ diff --git a/assets/img/python-data-analysis/seaborn-violinplot-introduction.png b/assets/img/python-data-analysis/seaborn-violinplot-introduction.png new file mode 100644 index 0000000..e32adfe Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-violinplot-introduction.png differ diff --git a/assets/img/python-data-analysis/seaborn-violinplot-with-hue-and-split.png b/assets/img/python-data-analysis/seaborn-violinplot-with-hue-and-split.png new file mode 100644 index 0000000..9ee322a Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-violinplot-with-hue-and-split.png differ diff --git a/assets/img/python-data-analysis/seaborn-violinplot-with-hue-without-split.png b/assets/img/python-data-analysis/seaborn-violinplot-with-hue-without-split.png new file mode 100644 index 0000000..5d3d1b2 Binary files /dev/null and b/assets/img/python-data-analysis/seaborn-violinplot-with-hue-without-split.png differ diff --git a/assets/img/python-data-analysis/stacked-bar-graph.png b/assets/img/python-data-analysis/stacked-bar-graph.png new file mode 100644 index 0000000..2146049 Binary files /dev/null and b/assets/img/python-data-analysis/stacked-bar-graph.png differ diff --git a/assets/img/python-data-analysis/stacked-horizontal-bar-graph.png b/assets/img/python-data-analysis/stacked-horizontal-bar-graph.png new file mode 100644 index 0000000..9f7d9d9 Binary files /dev/null and b/assets/img/python-data-analysis/stacked-horizontal-bar-graph.png differ diff --git a/assets/img/python-data-analysis/subplots-example.png b/assets/img/python-data-analysis/subplots-example.png new file mode 100644 index 0000000..1998003 Binary files /dev/null and b/assets/img/python-data-analysis/subplots-example.png differ diff --git a/assets/img/python-data-analysis/titanic-subplot-with-shared-axes.png b/assets/img/python-data-analysis/titanic-subplot-with-shared-axes.png new file mode 100644 index 0000000..b55034e Binary files /dev/null and b/assets/img/python-data-analysis/titanic-subplot-with-shared-axes.png differ diff --git a/assets/img/python-data-analysis/titanic-subplot-without-shared-axes.png b/assets/img/python-data-analysis/titanic-subplot-without-shared-axes.png new file mode 100644 index 0000000..6ed17e0 Binary files /dev/null and b/assets/img/python-data-analysis/titanic-subplot-without-shared-axes.png differ diff --git a/assets/img/python-data-analysis/ufo-sightings-by-month-abbrev-month-labels.png b/assets/img/python-data-analysis/ufo-sightings-by-month-abbrev-month-labels.png new file mode 100644 index 0000000..948ea0a Binary files /dev/null and 
b/assets/img/python-data-analysis/ufo-sightings-by-month-abbrev-month-labels.png differ diff --git a/assets/img/python-data-analysis/ufo-sightings-by-month-numeric-month-labels.png b/assets/img/python-data-analysis/ufo-sightings-by-month-numeric-month-labels.png new file mode 100644 index 0000000..ad539b9 Binary files /dev/null and b/assets/img/python-data-analysis/ufo-sightings-by-month-numeric-month-labels.png differ diff --git a/assets/img/python-data-analysis/ufo-value-counts-by-shape.png b/assets/img/python-data-analysis/ufo-value-counts-by-shape.png new file mode 100644 index 0000000..3e8e0b8 Binary files /dev/null and b/assets/img/python-data-analysis/ufo-value-counts-by-shape.png differ diff --git a/assets/img/python-data-analysis/value-counts-ufo-sightings-by-year-datetime.png b/assets/img/python-data-analysis/value-counts-ufo-sightings-by-year-datetime.png new file mode 100644 index 0000000..96f2fb7 Binary files /dev/null and b/assets/img/python-data-analysis/value-counts-ufo-sightings-by-year-datetime.png differ diff --git a/assets/img/relational-databases/er-diagram-example b/assets/img/relational-databases/er-diagram-example new file mode 100644 index 0000000..fe4933d --- /dev/null +++ b/assets/img/relational-databases/er-diagram-example @@ -0,0 +1 @@ +7Vtdc5s4FP01fmwGJMDw2DrdbWcnO53JzG7zqBjFMAHkEXJt769fYb6v7IApILeTlwRdCwFHR7rn3gsLvIoPf3KyDR6YT6MFMvzDAt8vEFoiLP9mhmNusLCRGzY89HOTWRsew/9oYSy77UKfpq2OgrFIhNu2cc2ShK5Fy0Y4Z/t2txcWta+6JRuqGB7XJFKt/4a+CHKri5a1/QsNN0F5ZdPx8l9iUnYuniQNiM/2DRP+vMArzpjIj+LDikYZdiUu+Xl/XPi1ujFOE9HnhFX8bL9uv358iP8yAsz/eX1h+w/FKD9ItCseOBU7Pxsxv2dxLIHgbJf4NBvLWOBP+yAU9HFL1tmveznz0haIOJItUx4Wo1Iu6OHi7ZoVCJI8lMVU8KPsUp7gFLgVxHGL5r4xC4UpaEyAVdhIMe+bauAaGnlQoHMFUkhBaoGcSF71004ebLIDeX5hkleorAqS8mKSvXQWFK02iEgziFgBUa5bQeS6RUbC7tJbwko34SwFq4TE9JYQMi3NENln1uRQeF7CKFqxiHHZTlhCx0EM2y3ElrYCmHcGMHsqwJwegCX+x8xv1jA0QGo7AXoIxffil+z4KbPf2UXr/tDodn8sG4l8ju/NRuOsrFmfdmqV5+X3SX3FW4OZkM/CdnxNu3dyQfiGii7fqM5sY+7sN+aO04iI8Ef7ds9NaHGFbyzMvO4l9+cASuRPWZzU9PpgnEpIlWsWjJOjoIxzYlf10MMJt3wnXLWT3zbhkAGkAh7KOBswzp2Xcu475SrPeOOUg0zxBlLOXIKBoAOdmHKeQrktZxtOYoV5c0dRlntborZ0P79YGOWAzVG38DXVuF17cAAxMnUHm6Yasq/lhpKqMM29KjECvlb7slQD82wVamQTAvoXO7ohusF4HLu3BlKfiPy3F1+FHOgWX8b52Z1HfdlwExqqvmygvhR6Tay+zPesxjWk0yr5HQ9wZTmQdEuY2UYzk26azEZNtKcmzzpIV/PsqSTuPKQrdUMn63J0tCXT7JG2OgxVycyBpjlycuMCg4wOBp1nq/kmW0dkXfnQ3XudpZN1nWTpyzoHeGpszMw6Nb9xKtuFiVqw4wGLn3fpdfUWn76QXVRH+T9ZpAJxaE9RDNOU41WOz+U8xoBtxDKVh+7alapqy2yWqmwVNRNycTzY1PBdhe2awH0i6Mw2cNprfEiN5cfxEddojCF6RoeHaLyJo8NDwJrL0CofhgJn5jIfUpMj5VSNKkz60s7UQjqjJ+k8nZyzIVWG1vkcw7lDwGnMXOpDgyJ/n6TBiWnmWxRMBWevVFUocrziLUV06kW4aLT70klbyc1rzxh2nYE7jtEx0IWpl3NBjo1u26xDOgk51FhJkFcqr2XstqqS0K5blberdOtWfE63OiTOAEme0+zf31Ns8n2jz+YmP1/s6fZVFnozHkBZYCgIepfWobuAEmXiLR6rlc4HncrCWOhQFn1Jp/eVNVhQGJrbha9zzJ3bxWrkSRPOshNvzmvAUrJ2pzFH9NldWBlSxBlxwZZfqnStV6zVSSjvXw1NUMI4VkkKTb1g1bLzWMrEwE6beZbldnEva32jPJSPRfnV5Lp10gCZ6g0MIOALHjMXjfG50HFMcVsnwBzc2IQ+SEoZztt1PdmA9GkwErlem5HLpTctI3+XdJtljiRQqo8YdYlitfg8Urrtl/KyuC/ttCbcMCQLdI69vazb4a4Hs0426+9Q8+71x7z48/8= \ No newline at end of file diff --git a/assets/img/relational-databases/er-diagram-example.drawio.png b/assets/img/relational-databases/er-diagram-example.drawio.png new file mode 100644 index 0000000..2836ade Binary files /dev/null and b/assets/img/relational-databases/er-diagram-example.drawio.png differ diff --git a/assets/img/spark/aqe-shuffle-partitions.drawio b/assets/img/spark/aqe-shuffle-partitions.drawio new file mode 100644 index 0000000..42663cc --- /dev/null +++ b/assets/img/spark/aqe-shuffle-partitions.drawio @@ 
-0,0 +1,61 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/aqe-shuffle-partitions.drawio.png b/assets/img/spark/aqe-shuffle-partitions.drawio.png new file mode 100644 index 0000000..6dd1ac7 Binary files /dev/null and b/assets/img/spark/aqe-shuffle-partitions.drawio.png differ diff --git a/assets/img/spark/aqe-skew-joins.drawio b/assets/img/spark/aqe-skew-joins.drawio new file mode 100644 index 0000000..a3dfc59 --- /dev/null +++ b/assets/img/spark/aqe-skew-joins.drawio @@ -0,0 +1,58 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/aqe-skew-joins.drawio.png b/assets/img/spark/aqe-skew-joins.drawio.png new file mode 100644 index 0000000..6043c9a Binary files /dev/null and b/assets/img/spark/aqe-skew-joins.drawio.png differ diff --git a/assets/img/spark/broadcast-join-working.drawio b/assets/img/spark/broadcast-join-working.drawio new file mode 100644 index 0000000..12944ae --- /dev/null +++ b/assets/img/spark/broadcast-join-working.drawio @@ -0,0 +1,280 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/broadcast-join-working.drawio.png b/assets/img/spark/broadcast-join-working.drawio.png new file mode 100644 index 0000000..b319f41 Binary files /dev/null and b/assets/img/spark/broadcast-join-working.drawio.png differ diff --git a/assets/img/spark/bucket-by-output.png b/assets/img/spark/bucket-by-output.png new file mode 100644 index 0000000..f0bdb82 Binary files /dev/null and b/assets/img/spark/bucket-by-output.png differ diff --git a/assets/img/spark/execution-plan.jpg b/assets/img/spark/execution-plan.jpg new file mode 100644 index 0000000..3b846e9 Binary files /dev/null and b/assets/img/spark/execution-plan.jpg differ diff --git a/assets/img/spark/job-stages-tasks.drawio b/assets/img/spark/job-stages-tasks.drawio new file mode 100644 index 0000000..bf9504f --- /dev/null +++ b/assets/img/spark/job-stages-tasks.drawio @@ -0,0 +1,58 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/job-stages-tasks.drawio.png b/assets/img/spark/job-stages-tasks.drawio.png new file mode 100644 index 0000000..f7235a8 Binary files /dev/null and b/assets/img/spark/job-stages-tasks.drawio.png differ diff --git a/assets/img/spark/list-databases.png b/assets/img/spark/list-databases.png new file mode 100644 index 0000000..c853e2f Binary files /dev/null and b/assets/img/spark/list-databases.png differ diff --git a/assets/img/spark/partition-by-output.png b/assets/img/spark/partition-by-output.png new file mode 100644 index 0000000..ed03f76 Binary files /dev/null and b/assets/img/spark/partition-by-output.png differ diff --git a/assets/img/spark/repartition-output.png b/assets/img/spark/repartition-output.png new file mode 100644 index 0000000..d71d5a4 Binary files /dev/null and
b/assets/img/spark/repartition-output.png differ diff --git a/assets/img/spark/shuffle-join-working.drawio b/assets/img/spark/shuffle-join-working.drawio new file mode 100644 index 0000000..7b6647b --- /dev/null +++ b/assets/img/spark/shuffle-join-working.drawio @@ -0,0 +1,310 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/shuffle-join-working.drawio.png b/assets/img/spark/shuffle-join-working.drawio.png new file mode 100644 index 0000000..f954263 Binary files /dev/null and b/assets/img/spark/shuffle-join-working.drawio.png differ diff --git a/assets/img/spark/simple-output.png b/assets/img/spark/simple-output.png new file mode 100644 index 0000000..1a80e8f Binary files /dev/null and b/assets/img/spark/simple-output.png differ diff --git a/assets/img/spark/spark-architecture.drawio b/assets/img/spark/spark-architecture.drawio new file mode 100644 index 0000000..8e457c2 --- /dev/null +++ b/assets/img/spark/spark-architecture.drawio @@ -0,0 +1,67 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/spark-architecture.drawio.png b/assets/img/spark/spark-architecture.drawio.png new file mode 100644 index 0000000..a61577f Binary files /dev/null and b/assets/img/spark/spark-architecture.drawio.png differ diff --git a/assets/img/spark/spark-ecosystem.drawio b/assets/img/spark/spark-ecosystem.drawio new file mode 100644 index 0000000..0e881a4 --- /dev/null +++ b/assets/img/spark/spark-ecosystem.drawio @@ -0,0 +1,42 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spark/spark-ecosystem.drawio.png b/assets/img/spark/spark-ecosystem.drawio.png new file mode 100644 index 0000000..0b4758a Binary files /dev/null and b/assets/img/spark/spark-ecosystem.drawio.png differ diff --git a/assets/img/spark/spark-streaming-jobs.png b/assets/img/spark/spark-streaming-jobs.png new file mode 100644 index 0000000..ed8d698 Binary files /dev/null and b/assets/img/spark/spark-streaming-jobs.png differ diff --git a/assets/img/spark/spark-to-java-types.png b/assets/img/spark/spark-to-java-types.png new file mode 100644 index 0000000..283ed14 Binary files /dev/null and b/assets/img/spark/spark-to-java-types.png differ diff --git a/assets/img/spark/streaming-input.png b/assets/img/spark/streaming-input.png new file mode 100644 index 0000000..96c5f29 Binary files /dev/null and b/assets/img/spark/streaming-input.png differ diff --git a/assets/img/spark/streaming-output-complete.png b/assets/img/spark/streaming-output-complete.png new file mode 100644 index 0000000..45b1d53 Binary files /dev/null and b/assets/img/spark/streaming-output-complete.png differ diff --git a/assets/img/spark/streaming-output-update.png b/assets/img/spark/streaming-output-update.png new file mode 100644 index 0000000..d0fbd66 Binary files /dev/null and
b/assets/img/spark/streaming-output-update.png differ diff --git a/assets/img/spring-reactive/schedulers.drawio b/assets/img/spring-reactive/schedulers.drawio new file mode 100644 index 0000000..aaf0646 --- /dev/null +++ b/assets/img/spring-reactive/schedulers.drawio @@ -0,0 +1,74 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spring-reactive/schedulers.drawio.png b/assets/img/spring-reactive/schedulers.drawio.png new file mode 100644 index 0000000..c0b9169 Binary files /dev/null and b/assets/img/spring-reactive/schedulers.drawio.png differ diff --git a/assets/img/spring/envers.png b/assets/img/spring/envers.png new file mode 100644 index 0000000..3987569 Binary files /dev/null and b/assets/img/spring/envers.png differ diff --git a/assets/img/spring/join-table-one-to-many.png b/assets/img/spring/join-table-one-to-many.png new file mode 100644 index 0000000..c0f1641 Binary files /dev/null and b/assets/img/spring/join-table-one-to-many.png differ diff --git a/assets/img/spring/joined.png b/assets/img/spring/joined.png new file mode 100644 index 0000000..111df20 Binary files /dev/null and b/assets/img/spring/joined.png differ diff --git a/assets/img/spring/many-to-many-with-entity.png b/assets/img/spring/many-to-many-with-entity.png new file mode 100644 index 0000000..691afbc Binary files /dev/null and b/assets/img/spring/many-to-many-with-entity.png differ diff --git a/assets/img/spring/many-to-many.png b/assets/img/spring/many-to-many.png new file mode 100644 index 0000000..02bff3a Binary files /dev/null and b/assets/img/spring/many-to-many.png differ diff --git a/assets/img/spring/mapped-superclass.png b/assets/img/spring/mapped-superclass.png new file mode 100644 index 0000000..04facbf Binary files /dev/null and b/assets/img/spring/mapped-superclass.png differ diff --git a/assets/img/spring/order-column.png b/assets/img/spring/order-column.png new file mode 100644 index 0000000..6eca471 Binary files /dev/null and b/assets/img/spring/order-column.png differ diff --git a/assets/img/spring/secondary-table.png b/assets/img/spring/secondary-table.png new file mode 100644 index 0000000..4fbb3b6 Binary files /dev/null and b/assets/img/spring/secondary-table.png differ diff --git a/assets/img/spring/single-table.png b/assets/img/spring/single-table.png new file mode 100644 index 0000000..549b542 Binary files /dev/null and b/assets/img/spring/single-table.png differ diff --git a/assets/img/spring/spring-security-architecture.drawio b/assets/img/spring/spring-security-architecture.drawio new file mode 100644 index 0000000..71f2c87 --- /dev/null +++ b/assets/img/spring/spring-security-architecture.drawio @@ -0,0 +1,121 @@ + [draw.io XML content not rendered] diff --git a/assets/img/spring/spring-security-architecture.drawio.png b/assets/img/spring/spring-security-architecture.drawio.png new file mode 100644 index 0000000..61c7cf8 Binary files /dev/null and b/assets/img/spring/spring-security-architecture.drawio.png differ diff --git a/assets/img/spring/webmvc-architecture.drawio b/assets/img/spring/webmvc-architecture.drawio new file mode 100644 index 0000000..cd714ee --- /dev/null +++ b/assets/img/spring/webmvc-architecture.drawio @@ -0,0 +1,70 @@ + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/spring/webmvc-architecture.drawio.png b/assets/img/spring/webmvc-architecture.drawio.png new file mode 100644 index 0000000..6a645c8 Binary files /dev/null and b/assets/img/spring/webmvc-architecture.drawio.png differ diff --git a/assets/img/warehouse-and-snowflake/b-tree.drawio b/assets/img/warehouse-and-snowflake/b-tree.drawio new file mode 100644 index 0000000..7466061 --- /dev/null +++ b/assets/img/warehouse-and-snowflake/b-tree.drawio @@ -0,0 +1 @@ +7VZNb9swDP01BrZDh/gjaXNM7G7doUGxHNaeBsFibaGyFShKbO/Xj6rkD9kpuhYDOhS9WOITRYp8z7K9MC7qb5Ls8mtBgXvBjNZemHhBcLG4wKcGGgPM/cgAmWTUQH4PbNlvsODMogdGYe84KiG4YjsXTEVZQqocjEgpKtftXnA3645kMAG2KeFT9CejKrdlBec9fgUsy9vM/mJpVgrSOttK9jmhohpA4aUXxlIIZWZFHQPXvWv7YvZ9fWK1O5iEUv3NhmSxuf21fLhJo/PrDVnRIN58P7NRjoQfbMHxCu1PiMc4BmYIzRB9tpWopm2PghqTr3NVcAR8nO6VFA8QCy4kIqUo0XN9zzgfQYSzrEQzxeMD4usjSMWw8Su7UDBKdZp1lTMF2x1Jdc4KVYaYFIeSgq5shpYtAgNA/WR3/K7nqFUQBSjZoEsrVMuSlWlkzWrAuYXyAd2hxYhVWdbF7YnAieXiBbwEJ3j5kQyYec9cRC4X/uyNyYhOkLG90mS078c7JsMP/jM25hM2Js2Hkq703a+7yMl+z1KXCrdJUDN1q+df5ta6s356ntQDt6RpjRJLGWzS5t1wrd/2aDUOIUAnH50RHViOOMgUnr+6FZEZqOeukim9A/rmJ+hrMQmcKHZ0j3uKU5vhRjAspFNPuHTV00mlDWHKtLuGX69RoGh0KUz0ZfowCfQosa7s16tu8aG6F6kuelPVLUZiWb5Wdf4oUPSvVIdm/+tn3Pv/5/DyDw== \ No newline at end of file diff --git a/assets/img/warehouse-and-snowflake/b-tree.drawio.png b/assets/img/warehouse-and-snowflake/b-tree.drawio.png new file mode 100644 index 0000000..85be4c2 Binary files /dev/null and b/assets/img/warehouse-and-snowflake/b-tree.drawio.png differ diff --git a/assets/img/warehouse-and-snowflake/data-warehouse.drawio b/assets/img/warehouse-and-snowflake/data-warehouse.drawio new file mode 100644 index 0000000..0af1f00 --- /dev/null +++ b/assets/img/warehouse-and-snowflake/data-warehouse.drawio @@ -0,0 +1,112 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/img/warehouse-and-snowflake/data-warehouse.png b/assets/img/warehouse-and-snowflake/data-warehouse.png new file mode 100644 index 0000000..6057f9e Binary files /dev/null and b/assets/img/warehouse-and-snowflake/data-warehouse.png differ diff --git a/assets/img/warehouse-and-snowflake/role-hierarchy.png b/assets/img/warehouse-and-snowflake/role-hierarchy.png new file mode 100644 index 0000000..bd11e70 Binary files /dev/null and b/assets/img/warehouse-and-snowflake/role-hierarchy.png differ diff --git a/assets/img/warehouse-and-snowflake/warehouse-architecture.jpg b/assets/img/warehouse-and-snowflake/warehouse-architecture.jpg new file mode 100644 index 0000000..5e1a7b1 Binary files /dev/null and b/assets/img/warehouse-and-snowflake/warehouse-architecture.jpg differ diff --git a/assets/js/data/search.json b/assets/js/data/search.json new file mode 100644 index 0000000..2601ed0 --- /dev/null +++ b/assets/js/data/search.json @@ -0,0 +1,20 @@ +--- +layout: compress +swcache: true +--- + +[ + {% for post in site.posts %} + { + "title": {{ post.title | jsonify }}, + "url": {{ post.url | relative_url | jsonify }}, + "categories": {{ post.categories | join: ', ' | jsonify }}, + "tags": {{ post.tags | join: ', ' | jsonify }}, + "date": "{{ post.date }}", + {% include no-linenos.html content=post.content %} + {% assign _content = content | strip_html | strip_newlines %} + "snippet": {{ _content | 
truncate: 200 | jsonify }}, + "content": {{ _content | jsonify }} + }{% unless forloop.last %},{% endunless %} + {% endfor %} +] diff --git a/assets/js/data/swcache.js b/assets/js/data/swcache.js new file mode 100644 index 0000000..9ff3899 --- /dev/null +++ b/assets/js/data/swcache.js @@ -0,0 +1,50 @@ +--- +layout: compress + +# The list to be cached by PWA +--- + +const resource = [ + /* --- CSS --- */ + '{{ "/assets/css/style.css" | relative_url }}', + + /* --- PWA --- */ + '{{ "/app.js" | relative_url }}', + '{{ "/sw.js" | relative_url }}', + + /* --- HTML --- */ + '{{ "/index.html" | relative_url }}', + '{{ "/404.html" | relative_url }}', + + {% for tab in site.tabs %} + '{{ tab.url | relative_url }}', + {% endfor %} + + /* --- Favicons & compressed JS --- */ + {% assign cache_list = site.static_files | where: 'swcache', true %} + {% for file in cache_list %} + '{{ file.path | relative_url }}'{%- unless forloop.last -%},{%- endunless -%} + {% endfor %} +]; + +/* The request url with below domain will be cached */ +const allowedDomains = [ + {% if site.google_analytics.id != empty and site.google_analytics.id %} + 'www.googletagmanager.com', + 'www.google-analytics.com', + {% endif %} + + '{{ site.url | split: "//" | last }}', + + {% if site.img_cdn contains '//' and site.img_cdn %} + '{{ site.img_cdn | split: '//' | last | split: '/' | first }}', + {% endif %} + + 'fonts.gstatic.com', + 'fonts.googleapis.com', + 'cdn.jsdelivr.net', + 'polyfill.io' +]; + +/* Requests that include the following path will be banned */ +const denyUrls = []; diff --git a/assets/js/dist/categories.min.js b/assets/js/dist/categories.min.js new file mode 100644 index 0000000..bebf80f --- /dev/null +++ b/assets/js/dist/categories.min.js @@ -0,0 +1,6 @@ +/*! + * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy/) + * © 2019 Cotes Chung + * MIT Licensed + */ +!function(){"use strict";function e(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")}function t(e,t){for(var r=0;re.length)&&(t=e.length);for(var r=0,o=new Array(t);r.row"),v=$("#topbar-title"),m=$("#search-wrapper"),g=$("#search-result-wrapper"),y=$("#search-results"),h=$("#search-input"),C=$("#search-hints"),w=$("html,body"),k="loaded",A="unloaded",S="input-focus",T="d-flex",j=function(){function t(){e(this,t)}return r(t,null,[{key:"on",value:function(){t.offset=window.scrollY,w.scrollTop(0)}},{key:"off",value:function(){w.scrollTop(t.offset)}}]),t}();o(j,"offset",0),o(j,"resultVisible",!1);var E=function(){function t(){e(this,t)}return r(t,null,[{key:"on",value:function(){f.addClass(A),v.addClass(A),d.addClass(A),m.addClass(T),p.addClass(k)}},{key:"off",value:function(){p.removeClass(k),m.removeClass(T),f.removeClass(A),v.removeClass(A),d.removeClass(A)}}]),t}(),O=function(){function t(){e(this,t)}return r(t,null,[{key:"on",value:function(){j.resultVisible||(j.on(),g.removeClass(A),b.addClass(A),j.resultVisible=!0)}},{key:"off",value:function(){j.resultVisible&&(y.empty(),C.hasClass(A)&&C.removeClass(A),g.addClass(A),b.removeClass(A),j.off(),h.val(""),j.resultVisible=!1)}}]),t}();function x(){return p.hasClass(k)}var P=$(".collapse");var V,I;$(".code-header>button").children().attr("class"),V=$(window),I=$("#back-to-top"),V.on("scroll",(function(){V.scrollTop()>50?I.fadeIn():I.fadeOut()})),I.on("click",(function(){V.scrollTop(0)})),n(document.querySelectorAll('[data-bs-toggle="tooltip"]')).map((function(e){return new bootstrap.Tooltip(e)})),0!==i.length&&i.off().on("click",(function(e){var 
t=$(e.target),r=t.prop("tagName")==="button".toUpperCase()?t:t.parent();modeToggle.flipMode(),r.trigger("blur")})),$("#sidebar-trigger").on("click",c.toggle),$("#mask").on("click",c.toggle),d.on("click",(function(){E.on(),O.on(),h.trigger("focus")})),p.on("click",(function(){E.off(),O.off()})),h.on("focus",(function(){m.addClass(S)})),h.on("focusout",(function(){m.removeClass(S)})),h.on("input",(function(){""===h.val()?x()?C.removeClass(A):O.off():(O.on(),x()&&C.addClass(A))})),P.on("hide.bs.collapse",(function(){var e="h_"+$(this).attr("id").substring(2);e&&($("#".concat(e," .far.fa-folder-open")).attr("class","far fa-folder fa-fw"),$("#".concat(e," i.fas")).addClass("rotate"),$("#".concat(e)).removeClass("hide-border-bottom"))})),P.on("show.bs.collapse",(function(){var e="h_"+$(this).attr("id").substring(2);e&&($("#".concat(e," .far.fa-folder")).attr("class","far fa-folder-open fa-fw"),$("#".concat(e," i.fas")).removeClass("rotate"),$("#".concat(e)).addClass("hide-border-bottom"))}))}(); diff --git a/assets/js/dist/commons.min.js b/assets/js/dist/commons.min.js new file mode 100644 index 0000000..97d930b --- /dev/null +++ b/assets/js/dist/commons.min.js @@ -0,0 +1,6 @@ +/*! + * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy/) + * © 2019 Cotes Chung + * MIT Licensed + */ +!function(){"use strict";function e(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")}function t(e,t){for(var r=0;re.length)&&(t=e.length);for(var r=0,n=new Array(t);r.row"),m=$("#topbar-title"),v=$("#search-wrapper"),y=$("#search-result-wrapper"),g=$("#search-results"),h=$("#search-input"),C=$("#search-hints"),w=$("html,body"),k="loaded",A="unloaded",S="input-focus",T="d-flex",j=function(){function t(){e(this,t)}return r(t,null,[{key:"on",value:function(){t.offset=window.scrollY,w.scrollTop(0)}},{key:"off",value:function(){w.scrollTop(t.offset)}}]),t}();n(j,"offset",0),n(j,"resultVisible",!1);var E,O,x=function(){function t(){e(this,t)}return r(t,null,[{key:"on",value:function(){c.addClass(A),m.addClass(A),d.addClass(A),v.addClass(T),p.addClass(k)}},{key:"off",value:function(){p.removeClass(k),v.removeClass(T),c.removeClass(A),m.removeClass(A),d.removeClass(A)}}]),t}(),P=function(){function t(){e(this,t)}return r(t,null,[{key:"on",value:function(){j.resultVisible||(j.on(),y.removeClass(A),b.addClass(A),j.resultVisible=!0)}},{key:"off",value:function(){j.resultVisible&&(g.empty(),C.hasClass(A)&&C.removeClass(A),y.addClass(A),b.removeClass(A),j.off(),h.val(""),j.resultVisible=!1)}}]),t}();function V(){return p.hasClass(k)}E=$(window),O=$("#back-to-top"),E.on("scroll",(function(){E.scrollTop()>50?O.fadeIn():O.fadeOut()})),O.on("click",(function(){E.scrollTop(0)})),o(document.querySelectorAll('[data-bs-toggle="tooltip"]')).map((function(e){return new bootstrap.Tooltip(e)})),0!==l.length&&l.off().on("click",(function(e){var t=$(e.target),r=t.prop("tagName")==="button".toUpperCase()?t:t.parent();modeToggle.flipMode(),r.trigger("blur")})),$("#sidebar-trigger").on("click",f.toggle),$("#mask").on("click",f.toggle),d.on("click",(function(){x.on(),P.on(),h.trigger("focus")})),p.on("click",(function(){x.off(),P.off()})),h.on("focus",(function(){v.addClass(S)})),h.on("focusout",(function(){v.removeClass(S)})),h.on("input",(function(){""===h.val()?V()?C.removeClass(A):P.off():(P.on(),V()&&C.addClass(A))}))}(); diff --git a/assets/js/dist/home.min.js b/assets/js/dist/home.min.js new file mode 100644 index 0000000..f8cd3f1 --- /dev/null +++ b/assets/js/dist/home.min.js @@ -0,0 +1,6 @@ 
+/*! + * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy/) + * © 2019 Cotes Chung + * MIT Licensed + */ +!function(){"use strict";function t(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}function e(t,e){for(var r=0;rt.length)&&(e=t.length);for(var r=0,n=new Array(e);r.row"),g=$("#topbar-title"),v=$("#search-wrapper"),y=$("#search-result-wrapper"),b=$("#search-results"),h=$("#search-input"),C=$("#search-hints"),w=$("html,body"),k="loaded",T="unloaded",j="input-focus",A="d-flex",S=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){e.offset=window.scrollY,w.scrollTop(0)}},{key:"off",value:function(){w.scrollTop(e.offset)}}]),e}();n(S,"offset",0),n(S,"resultVisible",!1);var x=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){f.addClass(T),g.addClass(T),d.addClass(T),v.addClass(A),m.addClass(k)}},{key:"off",value:function(){m.removeClass(k),v.removeClass(A),f.removeClass(T),g.removeClass(T),d.removeClass(T)}}]),e}(),E=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){S.resultVisible||(S.on(),y.removeClass(T),p.addClass(T),S.resultVisible=!0)}},{key:"off",value:function(){S.resultVisible&&(b.empty(),C.hasClass(T)&&C.removeClass(T),y.addClass(T),p.removeClass(T),S.off(),h.val(""),S.resultVisible=!1)}}]),e}();function F(){return m.hasClass(k)}$(".collapse");function O(t){t.parent().removeClass("shimmer")}$(".code-header>button").children().attr("class");var D,P,V,I=function(){function e(){t(this,e)}return r(e,null,[{key:"attrTimestamp",get:function(){return"data-ts"}},{key:"attrDateFormat",get:function(){return"data-df"}},{key:"locale",get:function(){return $("html").attr("lang").substring(0,2)}},{key:"getTimestamp",value:function(t){return Number(t.attr(e.attrTimestamp))}},{key:"getDateFormat",value:function(t){return t.attr(e.attrDateFormat)}}]),e}();D=$(window),P=$("#back-to-top"),D.on("scroll",(function(){D.scrollTop()>50?P.fadeIn():P.fadeOut()})),P.on("click",(function(){D.scrollTop(0)})),o(document.querySelectorAll('[data-bs-toggle="tooltip"]')).map((function(t){return new bootstrap.Tooltip(t)})),0!==l.length&&l.off().on("click",(function(t){var e=$(t.target),r=e.prop("tagName")==="button".toUpperCase()?e:e.parent();modeToggle.flipMode(),r.trigger("blur")})),$("#sidebar-trigger").on("click",c.toggle),$("#mask").on("click",c.toggle),d.on("click",(function(){x.on(),E.on(),h.trigger("focus")})),m.on("click",(function(){x.off(),E.off()})),h.on("focus",(function(){v.addClass(j)})),h.on("focusout",(function(){v.removeClass(j)})),h.on("input",(function(){""===h.val()?F()?C.removeClass(T):E.off():(E.on(),F()&&C.addClass(T))})),dayjs.locale(I.locale),dayjs.extend(window.dayjs_plugin_localizedFormat),$("[".concat(I.attrTimestamp,"]")).each((function(){var t=dayjs.unix(I.getTimestamp($(this))),e=t.format(I.getDateFormat($(this)));$(this).text(e),$(this).removeAttr(I.attrTimestamp),$(this).removeAttr(I.attrDateFormat);var r=$(this).attr("data-bs-toggle");if(void 0!==r&&"tooltip"===r){var n=t.format("llll");$(this).attr("data-bs-title",n),new bootstrap.Tooltip($(this))}})),(V=$("#core-wrapper img[data-src]")).length<=0||(document.addEventListener("lazyloaded",(function(t){O($(t.target))})),V.each((function(){$(this).hasClass("ls-is-cached")&&O($(this))})))}(); diff --git a/assets/js/dist/misc.min.js b/assets/js/dist/misc.min.js new file mode 100644 index 0000000..f365a6f --- /dev/null +++ b/assets/js/dist/misc.min.js @@ -0,0 +1,6 @@ +/*! 
+ * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy/) + * © 2019 Cotes Chung + * MIT Licensed + */ +!function(){"use strict";function t(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}function e(t,e){for(var r=0;rt.length)&&(e=t.length);for(var r=0,n=new Array(e);r.row"),v=$("#topbar-title"),g=$("#search-wrapper"),y=$("#search-result-wrapper"),b=$("#search-results"),h=$("#search-input"),C=$("#search-hints"),k=$("html,body"),w="loaded",T="unloaded",j="input-focus",A="d-flex",S=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){e.offset=window.scrollY,k.scrollTop(0)}},{key:"off",value:function(){k.scrollTop(e.offset)}}]),e}();n(S,"offset",0),n(S,"resultVisible",!1);var x=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){f.addClass(T),v.addClass(T),d.addClass(T),g.addClass(A),m.addClass(w)}},{key:"off",value:function(){m.removeClass(w),g.removeClass(A),f.removeClass(T),v.removeClass(T),d.removeClass(T)}}]),e}(),E=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){S.resultVisible||(S.on(),y.removeClass(T),p.addClass(T),S.resultVisible=!0)}},{key:"off",value:function(){S.resultVisible&&(b.empty(),C.hasClass(T)&&C.removeClass(T),y.addClass(T),p.removeClass(T),S.off(),h.val(""),S.resultVisible=!1)}}]),e}();function F(){return m.hasClass(w)}$(".collapse");$(".code-header>button").children().attr("class");var O,D,P=function(){function e(){t(this,e)}return r(e,null,[{key:"attrTimestamp",get:function(){return"data-ts"}},{key:"attrDateFormat",get:function(){return"data-df"}},{key:"locale",get:function(){return $("html").attr("lang").substring(0,2)}},{key:"getTimestamp",value:function(t){return Number(t.attr(e.attrTimestamp))}},{key:"getDateFormat",value:function(t){return t.attr(e.attrDateFormat)}}]),e}();O=$(window),D=$("#back-to-top"),O.on("scroll",(function(){O.scrollTop()>50?D.fadeIn():D.fadeOut()})),D.on("click",(function(){O.scrollTop(0)})),o(document.querySelectorAll('[data-bs-toggle="tooltip"]')).map((function(t){return new bootstrap.Tooltip(t)})),0!==l.length&&l.off().on("click",(function(t){var e=$(t.target),r=e.prop("tagName")==="button".toUpperCase()?e:e.parent();modeToggle.flipMode(),r.trigger("blur")})),$("#sidebar-trigger").on("click",c.toggle),$("#mask").on("click",c.toggle),d.on("click",(function(){x.on(),E.on(),h.trigger("focus")})),m.on("click",(function(){x.off(),E.off()})),h.on("focus",(function(){g.addClass(j)})),h.on("focusout",(function(){g.removeClass(j)})),h.on("input",(function(){""===h.val()?F()?C.removeClass(T):E.off():(E.on(),F()&&C.addClass(T))})),dayjs.locale(P.locale),dayjs.extend(window.dayjs_plugin_localizedFormat),$("[".concat(P.attrTimestamp,"]")).each((function(){var t=dayjs.unix(P.getTimestamp($(this))),e=t.format(P.getDateFormat($(this)));$(this).text(e),$(this).removeAttr(P.attrTimestamp),$(this).removeAttr(P.attrDateFormat);var r=$(this).attr("data-bs-toggle");if(void 0!==r&&"tooltip"===r){var n=t.format("llll");$(this).attr("data-bs-title",n),new bootstrap.Tooltip($(this))}}))}(); diff --git a/assets/js/dist/page.min.js b/assets/js/dist/page.min.js new file mode 100644 index 0000000..dcce2df --- /dev/null +++ b/assets/js/dist/page.min.js @@ -0,0 +1,6 @@ +/*! 
+ * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy/) + * © 2019 Cotes Chung + * MIT Licensed + */ +!function(){"use strict";function t(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}function e(t,e){for(var n=0;nt.length)&&(e=t.length);for(var n=0,r=new Array(e);n.row"),v=$("#topbar-title"),g=$("#search-wrapper"),b=$("#search-result-wrapper"),h=$("#search-results"),y=$("#search-input"),C=$("#search-hints"),w=$("html,body"),k="loaded",S="unloaded",A="input-focus",T="d-flex",E=function(){function e(){t(this,e)}return n(e,null,[{key:"on",value:function(){e.offset=window.scrollY,w.scrollTop(0)}},{key:"off",value:function(){w.scrollTop(e.offset)}}]),e}();r(E,"offset",0),r(E,"resultVisible",!1);var j=function(){function e(){t(this,e)}return n(e,null,[{key:"on",value:function(){f.addClass(S),v.addClass(S),d.addClass(S),g.addClass(T),p.addClass(k)}},{key:"off",value:function(){p.removeClass(k),g.removeClass(T),f.removeClass(S),v.removeClass(S),d.removeClass(S)}}]),e}(),x=function(){function e(){t(this,e)}return n(e,null,[{key:"on",value:function(){E.resultVisible||(E.on(),b.removeClass(S),m.addClass(S),E.resultVisible=!0)}},{key:"off",value:function(){E.resultVisible&&(h.empty(),C.hasClass(S)&&C.removeClass(S),b.addClass(S),m.removeClass(S),E.off(),y.val(""),E.resultVisible=!1)}}]),e}();function O(){return p.hasClass(k)}$(".collapse");var P=".code-header>button",V="fas fa-check",I="timeout",N="data-title-succeed",q="data-bs-original-title",z=2e3;function D(t){if($(t)[0].hasAttribute(I)){var e=$(t).attr(I);if(Number(e)>Date.now())return!0}return!1}function M(t){$(t).attr(I,Date.now()+z)}function U(t){$(t).removeAttr(I)}var B,J,L,Y=$(P).children().attr("class");function F(t){t.parent().removeClass("shimmer")}B=$(window),J=$("#back-to-top"),B.on("scroll",(function(){B.scrollTop()>50?J.fadeIn():J.fadeOut()})),J.on("click",(function(){B.scrollTop(0)})),o(document.querySelectorAll('[data-bs-toggle="tooltip"]')).map((function(t){return new bootstrap.Tooltip(t)})),0!==l.length&&l.off().on("click",(function(t){var e=$(t.target),n=e.prop("tagName")==="button".toUpperCase()?e:e.parent();modeToggle.flipMode(),n.trigger("blur")})),$("#sidebar-trigger").on("click",c.toggle),$("#mask").on("click",c.toggle),d.on("click",(function(){j.on(),x.on(),y.trigger("focus")})),p.on("click",(function(){j.off(),x.off()})),y.on("focus",(function(){g.addClass(A)})),y.on("focusout",(function(){g.removeClass(A)})),y.on("input",(function(){""===y.val()?O()?C.removeClass(S):x.off():(x.on(),O()&&C.addClass(S))})),(L=$("#core-wrapper img[data-src]")).length<=0||(document.addEventListener("lazyloaded",(function(t){F($(t.target))})),L.each((function(){$(this).hasClass("ls-is-cached")&&F($(this))}))),$(".popup")<=0||$(".popup").magnificPopup({type:"image",closeOnContentClick:!0,showCloseBtn:!1,zoom:{enabled:!0,duration:300,easing:"ease-in-out"}}),function(){if($(P).length){var t=new ClipboardJS(P,{target:function(t){return t.parentNode.nextElementSibling.querySelector("code .rouge-code")}});o(document.querySelectorAll(P)).map((function(t){return new bootstrap.Tooltip(t,{placement:"left"})})),t.on("success",(function(t){t.clearSelection();var e=t.trigger;D(e)||(!function(t){$(t).children().attr("class",V)}(e),function(t){var e=$(t).attr(N);$(t).attr(q,e).tooltip("show")}(e),M(e),setTimeout((function(){!function(t){$(t).tooltip("hide").removeAttr(q)}(e),function(t){$(t).children().attr("class",Y)}(e),U(e)}),z))}))}$("#copy-link").on("click",(function(t){var 
e=$(t.target);D(e)||navigator.clipboard.writeText(window.location.href).then((function(){var t=e.attr(q),n=e.attr(N);e.attr(q,n).tooltip("show"),M(e),setTimeout((function(){e.attr(q,t),U(e)}),z)}))}))}()}(); diff --git a/assets/js/dist/post.min.js b/assets/js/dist/post.min.js new file mode 100644 index 0000000..916e367 --- /dev/null +++ b/assets/js/dist/post.min.js @@ -0,0 +1,6 @@ +/*! + * Chirpy v6.1.0 (https://github.com/cotes2020/jekyll-theme-chirpy/) + * © 2019 Cotes Chung + * MIT Licensed + */ +!function(){"use strict";function t(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}function e(t,e){for(var r=0;rt.length)&&(e=t.length);for(var r=0,n=new Array(e);r.row"),g=$("#topbar-title"),v=$("#search-wrapper"),h=$("#search-result-wrapper"),b=$("#search-results"),y=$("#search-input"),w=$("#search-hints"),C=$("html,body"),k="loaded",S="unloaded",T="input-focus",A="d-flex",j=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){e.offset=window.scrollY,C.scrollTop(0)}},{key:"off",value:function(){C.scrollTop(e.offset)}}]),e}();n(j,"offset",0),n(j,"resultVisible",!1);var x=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){f.addClass(S),g.addClass(S),d.addClass(S),v.addClass(A),p.addClass(k)}},{key:"off",value:function(){p.removeClass(k),v.removeClass(A),f.removeClass(S),g.removeClass(S),d.removeClass(S)}}]),e}(),E=function(){function e(){t(this,e)}return r(e,null,[{key:"on",value:function(){j.resultVisible||(j.on(),h.removeClass(S),m.addClass(S),j.resultVisible=!0)}},{key:"off",value:function(){j.resultVisible&&(b.empty(),w.hasClass(S)&&w.removeClass(S),h.addClass(S),m.removeClass(S),j.off(),y.val(""),j.resultVisible=!1)}}]),e}();function D(){return p.hasClass(k)}$(".collapse");var O=".code-header>button",F="fas fa-check",P="timeout",N="data-title-succeed",V="data-bs-original-title",q=2e3;function I(t){if($(t)[0].hasAttribute(P)){var e=$(t).attr(P);if(Number(e)>Date.now())return!0}return!1}function z(t){$(t).attr(P,Date.now()+q)}function L(t){$(t).removeAttr(P)}var M=$(O).children().attr("class");function U(t){t.parent().removeClass("shimmer")}var _,B,J,Y=function(){function e(){t(this,e)}return r(e,null,[{key:"attrTimestamp",get:function(){return"data-ts"}},{key:"attrDateFormat",get:function(){return"data-df"}},{key:"locale",get:function(){return $("html").attr("lang").substring(0,2)}},{key:"getTimestamp",value:function(t){return Number(t.attr(e.attrTimestamp))}},{key:"getDateFormat",value:function(t){return t.attr(e.attrDateFormat)}}]),e}();_=$(window),B=$("#back-to-top"),_.on("scroll",(function(){_.scrollTop()>50?B.fadeIn():B.fadeOut()})),B.on("click",(function(){_.scrollTop(0)})),o(document.querySelectorAll('[data-bs-toggle="tooltip"]')).map((function(t){return new bootstrap.Tooltip(t)})),0!==l.length&&l.off().on("click",(function(t){var e=$(t.target),r=e.prop("tagName")==="button".toUpperCase()?e:e.parent();modeToggle.flipMode(),r.trigger("blur")})),$("#sidebar-trigger").on("click",u.toggle),$("#mask").on("click",u.toggle),d.on("click",(function(){x.on(),E.on(),y.trigger("focus")})),p.on("click",(function(){x.off(),E.off()})),y.on("focus",(function(){v.addClass(T)})),y.on("focusout",(function(){v.removeClass(T)})),y.on("input",(function(){""===y.val()?D()?w.removeClass(S):E.off():(E.on(),D()&&w.addClass(S))})),(J=$("#core-wrapper 
img[data-src]")).length<=0||(document.addEventListener("lazyloaded",(function(t){U($(t.target))})),J.each((function(){$(this).hasClass("ls-is-cached")&&U($(this))}))),$(".popup")<=0||$(".popup").magnificPopup({type:"image",closeOnContentClick:!0,showCloseBtn:!1,zoom:{enabled:!0,duration:300,easing:"ease-in-out"}}),dayjs.locale(Y.locale),dayjs.extend(window.dayjs_plugin_localizedFormat),$("[".concat(Y.attrTimestamp,"]")).each((function(){var t=dayjs.unix(Y.getTimestamp($(this))),e=t.format(Y.getDateFormat($(this)));$(this).text(e),$(this).removeAttr(Y.attrTimestamp),$(this).removeAttr(Y.attrDateFormat);var r=$(this).attr("data-bs-toggle");if(void 0!==r&&"tooltip"===r){var n=t.format("llll");$(this).attr("data-bs-title",n),new bootstrap.Tooltip($(this))}})),function(){if($(O).length){var t=new ClipboardJS(O,{target:function(t){return t.parentNode.nextElementSibling.querySelector("code .rouge-code")}});o(document.querySelectorAll(O)).map((function(t){return new bootstrap.Tooltip(t,{placement:"left"})})),t.on("success",(function(t){t.clearSelection();var e=t.trigger;I(e)||(!function(t){$(t).children().attr("class",F)}(e),function(t){var e=$(t).attr(N);$(t).attr(V,e).tooltip("show")}(e),z(e),setTimeout((function(){!function(t){$(t).tooltip("hide").removeAttr(V)}(e),function(t){$(t).children().attr("class",M)}(e),L(e)}),q))}))}$("#copy-link").on("click",(function(t){var e=$(t.target);I(e)||navigator.clipboard.writeText(window.location.href).then((function(){var t=e.attr(V),r=e.attr(N);e.attr(V,r).tooltip("show"),z(e),setTimeout((function(){e.attr(V,t),L(e)}),q)}))}))}(),document.querySelector("#core-wrapper h2,#core-wrapper h3")&&tocbot.init({tocSelector:"#toc",contentSelector:".post-content",ignoreSelector:"[data-toc-skip]",headingSelector:"h2, h3",orderedList:!1,scrollSmooth:!1})}(); diff --git a/assets/js/pwa/app.js b/assets/js/pwa/app.js new file mode 100644 index 0000000..c798fe2 --- /dev/null +++ b/assets/js/pwa/app.js @@ -0,0 +1,47 @@ +--- +layout: compress +permalink: '/app.js' +--- + +const $notification = $('#notification'); +const $btnRefresh = $('#notification .toast-body>button'); + +if ('serviceWorker' in navigator) { + /* Registering Service Worker */ + navigator.serviceWorker.register('{{ "/sw.js" | relative_url }}') + .then(registration => { + + /* in case the user ignores the notification */ + if (registration.waiting) { + $notification.toast('show'); + } + + registration.addEventListener('updatefound', () => { + registration.installing.addEventListener('statechange', () => { + if (registration.waiting) { + if (navigator.serviceWorker.controller) { + $notification.toast('show'); + } + } + }); + }); + + $btnRefresh.click(() => { + if (registration.waiting) { + registration.waiting.postMessage('SKIP_WAITING'); + } + $notification.toast('hide'); + }); + }); + + let refreshing = false; + + /* Detect controller change and refresh all the opened tabs */ + navigator.serviceWorker.addEventListener('controllerchange', () => { + if (!refreshing) { + window.location.reload(); + refreshing = true; + } + }); +} + diff --git a/assets/js/pwa/sw.js b/assets/js/pwa/sw.js new file mode 100644 index 0000000..3213b4f --- /dev/null +++ b/assets/js/pwa/sw.js @@ -0,0 +1,90 @@ +--- +layout: compress +permalink: '/sw.js' +# PWA service worker +--- + +self.importScripts('{{ "/assets/js/data/swcache.js" | relative_url }}'); + +const cacheName = 'chirpy-{{ "now" | date: "%Y%m%d.%H%M%S" }}'; + +function verifyDomain(url) { + for (const domain of allowedDomains) { + const regex = 
RegExp(`^http(s)?:\/\/${domain}\/`); + if (regex.test(url)) { + return true; + } + } + + return false; +} + +function isExcluded(url) { + for (const item of denyUrls) { + if (url === item) { + return true; + } + } + return false; +} + +self.addEventListener('install', event => { + event.waitUntil( + caches.open(cacheName).then(cache => { + return cache.addAll(resource); + }) + ); +}); + +self.addEventListener('activate', event => { + event.waitUntil( + caches.keys().then(keyList => { + return Promise.all( + keyList.map(key => { + if (key !== cacheName) { + return caches.delete(key); + } + }) + ); + }) + ); +}); + +self.addEventListener('message', (event) => { + if (event.data === 'SKIP_WAITING') { + self.skipWaiting(); + } +}); + +self.addEventListener('fetch', event => { + event.respondWith( + caches.match(event.request).then(response => { + if (response) { + return response; + } + + return fetch(event.request).then(response => { + const url = event.request.url; + + if (event.request.method !== 'GET' || + !verifyDomain(url) || + isExcluded(url)) { + return response; + } + + /* + see: + */ + let responseToCache = response.clone(); + + caches.open(cacheName).then(cache => { + /* console.log('[sw] Caching new resource: ' + event.request.url); */ + cache.put(event.request, responseToCache); + }); + + return response; + }); + }) + ); +}); + diff --git a/assets/js/pwa/unregister.js b/assets/js/pwa/unregister.js new file mode 100644 index 0000000..bd91150 --- /dev/null +++ b/assets/js/pwa/unregister.js @@ -0,0 +1,12 @@ +--- +layout: compress +permalink: '/unregister.js' +--- + +if ('serviceWorker' in navigator) { + navigator.serviceWorker.getRegistrations().then((registrations) => { + for (let reg of registrations) { + reg.unregister(); + } + }); +} diff --git a/assets/robots.txt b/assets/robots.txt new file mode 100644 index 0000000..45c34e0 --- /dev/null +++ b/assets/robots.txt @@ -0,0 +1,10 @@ +--- +permalink: /robots.txt +# The robots rules +--- + +User-agent: * + +Disallow: /norobots/ + +Sitemap: {{ '/sitemap.xml' | absolute_url }} diff --git a/index.html b/index.html new file mode 100644 index 0000000..1357b08 --- /dev/null +++ b/index.html @@ -0,0 +1,4 @@ +--- +layout: home +# Index page +--- diff --git a/jekyll-theme-chirpy.gemspec b/jekyll-theme-chirpy.gemspec new file mode 100644 index 0000000..c577306 --- /dev/null +++ b/jekyll-theme-chirpy.gemspec @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +Gem::Specification.new do |spec| + spec.name = "jekyll-theme-chirpy" + spec.version = "6.1.0" + spec.authors = ["Cotes Chung"] + spec.email = ["cotes.chung@gmail.com"] + + spec.summary = "A minimal, responsive and feature-rich Jekyll theme for technical writing." 
+ spec.homepage = "https://github.com/cotes2020/jekyll-theme-chirpy" + spec.license = "MIT" + + spec.files = `git ls-files -z`.split("\x0").select { |f| + f.match(%r!^((_(includes|layouts|sass|data)|assets)\/|README|LICENSE)!i) + } + + spec.metadata = { + "bug_tracker_uri" => "https://github.com/cotes2020/jekyll-theme-chirpy/issues", + "documentation_uri" => "https://github.com/cotes2020/jekyll-theme-chirpy/#readme", + "homepage_uri" => "https://cotes2020.github.io/chirpy-demo", + "source_code_uri" => "https://github.com/cotes2020/jekyll-theme-chirpy", + "wiki_uri" => "https://github.com/cotes2020/jekyll-theme-chirpy/wiki", + "plugin_type" => "theme" + } + + spec.required_ruby_version = ">= 2.6" + + spec.add_runtime_dependency "jekyll", "~> 4.3" + spec.add_runtime_dependency "jekyll-paginate", "~> 1.1" + spec.add_runtime_dependency "jekyll-redirect-from", "~> 0.16" + spec.add_runtime_dependency "jekyll-seo-tag", "~> 2.7" + spec.add_runtime_dependency "jekyll-archives", "~> 2.2" + spec.add_runtime_dependency "jekyll-sitemap", "~> 1.4" + spec.add_runtime_dependency "jekyll-include-cache", "~> 0.2" + +end diff --git a/package.json b/package.json new file mode 100644 index 0000000..674f76e --- /dev/null +++ b/package.json @@ -0,0 +1,35 @@ +{ + "name": "jekyll-theme-chirpy", + "version": "6.1.0", + "description": "A minimal, responsive and feature-rich Jekyll theme for technical writing.", + "repository": { + "type": "git", + "url": "git+https://github.com/cotes2020/jekyll-theme-chirpy.git" + }, + "author": "Cotes Chung", + "license": "MIT", + "bugs": { + "url": "https://github.com/cotes2020/jekyll-theme-chirpy/issues" + }, + "homepage": "https://github.com/cotes2020/jekyll-theme-chirpy/", + "scripts": { + "prebuild": "npx rimraf assets/js/dist", + "build": "NODE_ENV=production npx rollup -c --bundleConfigAsCjs", + "prewatch": "npx rimraf assets/js/dist", + "watch": "npx rollup -c --bundleConfigAsCjs -w", + "test": "npx stylelint _sass/**/*.scss", + "fixlint": "npm run test -- --fix" + }, + "devDependencies": { + "@babel/core": "^7.21.3", + "@babel/plugin-proposal-class-properties": "^7.18.6", + "@babel/preset-env": "^7.20.2", + "@rollup/plugin-babel": "^6.0.3", + "@rollup/plugin-terser": "^0.4.0", + "rimraf": "^5.0.1", + "rollup": "^3.20.2", + "rollup-plugin-license": "^3.0.1", + "stylelint": "^15.3.0", + "stylelint-config-standard-scss": "^9.0.0" + } +} diff --git a/rollup.config.js b/rollup.config.js new file mode 100644 index 0000000..907ca3e --- /dev/null +++ b/rollup.config.js @@ -0,0 +1,46 @@ +import babel from '@rollup/plugin-babel'; +import terser from '@rollup/plugin-terser'; +import license from 'rollup-plugin-license'; +import path from 'path'; + +const JS_SRC = '_javascript'; +const JS_DIST = 'assets/js/dist'; +const isProd = process.env.NODE_ENV === 'production'; + +function build(filename) { + return { + input: [`${JS_SRC}/${filename}.js`], + output: { + file: `${JS_DIST}/${filename}.min.js`, + format: 'iife', + name: 'Chirpy', + sourcemap: !isProd + }, + watch: { + include: `${JS_SRC}/**` + }, + plugins: [ + babel({ + babelHelpers: 'bundled', + presets: ['@babel/env'], + plugins: ['@babel/plugin-proposal-class-properties'] + }), + license({ + banner: { + commentStyle: 'ignored', + content: { file: path.join(__dirname, JS_SRC, '_copyright') } + } + }), + isProd && terser() + ] + }; +} + +export default [ + build('commons'), + build('home'), + build('categories'), + build('page'), + build('post'), + build('misc') +]; diff --git a/tools/init b/tools/init new file mode 100755 index 
0000000..5baac5d --- /dev/null +++ b/tools/init @@ -0,0 +1,141 @@ +#!/usr/bin/env bash +# +# Init the environment for new user. + +set -eu + +# CLI Dependencies +CLI=("git" "npm") + +ACTIONS_WORKFLOW=pages-deploy.yml + +# temporary file suffixes that make `sed -i` compatible with BSD and Linux +TEMP_SUFFIX="to-delete" + +_no_gh=false + +help() { + echo "Usage:" + echo + echo " bash /path/to/init [options]" + echo + echo "Options:" + echo " --no-gh Do not deploy to Github." + echo " -h, --help Print this help information." +} + +# BSD and GNU compatible sed +_sedi() { + regex=$1 + file=$2 + sed -i.$TEMP_SUFFIX "$regex" "$file" + rm -f "$file".$TEMP_SUFFIX +} + +_check_cli() { + for i in "${!CLI[@]}"; do + cli="${CLI[$i]}" + if ! command -v "$cli" &>/dev/null; then + echo "Command '$cli' not found! Hint: you should install it." + exit 1 + fi + done +} + +_check_status() { + if [[ -n $(git status . -s) ]]; then + echo "Error: Commit unstaged files first, and then run this tool again." + exit 1 + fi +} + +_check_init() { + local _has_inited=false + + if [[ ! -d .github ]]; then # using option `--no-gh` + _has_inited=true + else + if [[ -f .github/workflows/$ACTIONS_WORKFLOW ]]; then + # on BSD, the `wc` could contains blank + local _count + _count=$(find .github/workflows/ -type f -name "*.yml" | wc -l) + if [[ ${_count//[[:blank:]]/} == 1 ]]; then + _has_inited=true + fi + fi + fi + + if $_has_inited; then + echo "Already initialized." + exit 0 + fi +} + +check_env() { + _check_cli + _check_status + _check_init +} + +checkout_latest_release() { + hash=$(git log --grep="chore(release):" -1 --pretty="%H") + git reset --hard "$hash" +} + +init_files() { + if $_no_gh; then + rm -rf .github + else + ## Change the files of `.github` + mv .github/workflows/$ACTIONS_WORKFLOW.hook . + rm -rf .github + mkdir -p .github/workflows + mv ./${ACTIONS_WORKFLOW}.hook .github/workflows/${ACTIONS_WORKFLOW} + + ## Cleanup image settings in site config + _sedi "s/^img_cdn:.*/img_cdn:/;s/^avatar:.*/avatar:/" _config.yml + fi + + # remove the other files + rm -rf _posts/* + + # build assets + npm i && npm run build + + # track the js output + _sedi "/^assets.*\/dist/d" .gitignore +} + +commit() { + git add -A + git commit -m "chore: initialize the environment" -q + echo -e "\n[INFO] Initialization successful!\n" +} + +main() { + check_env + checkout_latest_release + init_files + commit +} + +while (($#)); do + opt="$1" + case $opt in + --no-gh) + _no_gh=true + shift + ;; + -h | --help) + help + exit 0 + ;; + *) + # unknown option + help + exit 1 + ;; + esac +done + +main diff --git a/tools/release b/tools/release new file mode 100755 index 0000000..43182eb --- /dev/null +++ b/tools/release @@ -0,0 +1,240 @@ +#!/usr/bin/env bash +# +# Release a new version to the GitLab flow production branch. +# +# For a new major/minor version, bump version on the main branch, and then merge into the production branch. +# +# For a patch version, bump the version number on the patch branch, then merge that branch into the main branch +# and production branch. 
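+#
+# Illustrative walk-through of that flow (hypothetical version numbers, shown
+# only as an example of the branch model described above):
+#
+#   minor release: bump on main -> commit "chore(release): 6.2.0" -> merge main
+#                  into production -> tag v6.2.0 on the merge commit
+#   patch release: bump on hotfix/6.2.1 -> merge the hotfix into production
+#                  (tag v6.2.1) and back into main, then delete the hotfix branch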
+# +# +# Usage: run on main branch or the patch branch +# +# Requires: Git, NPM and RubyGems + +set -eu + +opt_pre=false # preview mode option +opt_skip_ver=false # option for skip versioning + +working_branch="$(git branch --show-current)" + +STAGING_BRANCH="$(git symbolic-ref refs/remotes/origin/HEAD | sed 's@^refs/remotes/origin/@@')" + +PROD_BRANCH="production" + +GEM_SPEC="jekyll-theme-chirpy.gemspec" + +NODE_CONFIG="package.json" + +JS_DIST="assets/js/dist" +BACKUP_PATH="$(mktemp -d)" + +FILES=( + "_sass/jekyll-theme-chirpy.scss" + "$GEM_SPEC" + "$NODE_CONFIG" +) + +TOOLS=( + "git" + "npm" + "standard-version" + "gem" +) + +help() { + echo "A tool to release new version Chirpy gem" + echo + echo "Usage:" + echo + echo " bash ./tools/release [options]" + echo + echo "Options:" + echo " -k, --skip-versioning Skip the step of generating the version number." + echo " -p, --preview Enable preview mode, only package, and will not modify the branches" + echo " -h, --help Print this information." +} + +_check_git() { + # ensure nothing is uncommitted + if [[ -n $(git status . -s) ]]; then + echo "Abort: Commit the staged files first, and then run this tool again." + exit 1 + fi + + # ensure the working branch is the main/patch branch + if [[ $working_branch != "$STAGING_BRANCH" && $working_branch != hotfix/* ]]; then + echo "Abort: Please run on the main branch or patch branches." + exit 1 + fi +} + +_check_src() { + for i in "${!FILES[@]}"; do + _src="${FILES[$i]}" + if [[ ! -f $_src && ! -d $_src ]]; then + echo -e "Error: Missing file \"$_src\"!\n" + exit 1 + fi + done + +} + +_check_command() { + for i in "${!TOOLS[@]}"; do + cli="${TOOLS[$i]}" + if ! command -v "$cli" &>/dev/null; then + echo "Command '$cli' not found!" + exit 1 + fi + done +} + +_check_node_packages() { + if [[ ! -d node_modules || "$(du node_modules | awk '{print $1}')" == "0" ]]; then + npm i + fi +} + +check() { + _check_command + _check_git + _check_src + _check_node_packages +} + +_bump_files() { + for i in "${!FILES[@]}"; do + if [[ ${FILES[$i]} == "$NODE_CONFIG" ]]; then + continue + fi + + sed -i "s/v[[:digit:]]\+\.[[:digit:]]\+\.[[:digit:]]\+/v$1/" "${FILES[$i]}" + done + + npm run build +} + +_bump_gemspec() { + sed -i "s/[[:digit:]]\+\.[[:digit:]]\+\.[[:digit:]]\+/$1/" "$GEM_SPEC" +} + +# 1. Bump latest version number to the following files: +# +# - _sass/jekyll-theme-chirpy.scss +# - _javascript/copyright +# - assets/js/dist/*.js (will be built by gulp later) +# - jekyll-theme-chirpy.gemspec +# +# 2. Create a commit to save the changes. +bump() { + _bump_files "$1" + _bump_gemspec "$1" + + if [[ $opt_pre = false && -n $(git status . -s) ]]; then + git add . + git commit -m "chore(release): $1" + fi +} + +## Remove unnecessary theme settings +cleanup_config() { + cp _config.yml _config.yml.bak + sed -i "s/^img_cdn:.*/img_cdn:/;s/^avatar:.*/avatar:/" _config.yml +} + +resume_config() { + mv _config.yml.bak _config.yml +} + +# build a gem package +build_gem() { + echo -e "Build the gem package for v$_version\n" + cleanup_config + rm -f ./*.gem + git add "$JS_DIST" -f # add JS dist to gem + gem build "$GEM_SPEC" + cp "$JS_DIST"/* "$BACKUP_PATH" + git restore --staged "$JS_DIST" # resume the git status + resume_config +} + +# Update the git branch graph, tag, and then build the gem package. 
+release() { + _version="$1" # X.Y.Z + + git checkout "$PROD_BRANCH" + git merge --no-ff --no-edit "$working_branch" + + # Create a new tag on working branch + echo -e "Create tag v$_version\n" + git tag "v$_version" + + # Merge from patch branch to the staging branch + if [[ $working_branch == hotfix/* ]]; then + git checkout "$STAGING_BRANCH" + git merge --no-ff --no-edit "$working_branch" + git branch -D "$working_branch" + fi +} + +main() { + if [[ $opt_skip_ver = false ]]; then + check + + # auto-generate a new version number to the file 'package.json' + if $opt_pre; then + standard-version --prerelease rc + else + standard-version + fi + fi + + # Change heading of Patch version to level 2 (a bug from `standard-version`) + sed -i "s/^### \[/## \[/g" CHANGELOG.md + # Replace multiple empty lines with a single empty line + sed -i "/^$/N;/^\n$/D" CHANGELOG.md + + _version="$(grep '"version":' "$NODE_CONFIG" | sed 's/.*: "//;s/".*//')" + + echo -e "Bump version number to $_version\n" + bump "$_version" + + build_gem + + if [[ $opt_pre = true ]]; then + # Undo all changes on Git + git reset --hard && git clean -fd + else + release "$_version" + fi + + # restore the dist files for future development + mkdir -p "$JS_DIST" && cp "$BACKUP_PATH"/* "$JS_DIST" +} + +while (($#)); do + opt="$1" + case $opt in + -p | --preview) + opt_pre=true + shift + ;; + -k | --skip-versioning) + opt_skip_ver=true + shift + ;; + -h | --help) + help + exit 0 + ;; + *) + # unknown option + help + exit 1 + ;; + esac +done + +main diff --git a/tools/run b/tools/run new file mode 100755 index 0000000..8072e41 --- /dev/null +++ b/tools/run @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# +# Run jekyll serve and then launch the site + +bundle exec jekyll s -H 0.0.0.0 -l diff --git a/tools/test b/tools/test new file mode 100755 index 0000000..83a9490 --- /dev/null +++ b/tools/test @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# +# Build and test the site content +# +# Requirement: html-proofer, jekyll +# +# Usage: See help information + +set -eu + +SITE_DIR="_site" + +_config="_config.yml" + +_baseurl="" + +help() { + echo "Build and test the site content" + echo + echo "Usage:" + echo + echo " bash ./tools/test [options]" + echo + echo "Options:" + echo ' -c, --config "" Specify config file(s)' + echo " -h, --help Print this information." +} + +read_baseurl() { + if [[ $_config == *","* ]]; then + # multiple config + IFS="," + read -ra config_array <<<"$_config" + + # reverse loop the config files + for ((i = ${#config_array[@]} - 1; i >= 0; i--)); do + _tmp_baseurl="$(grep '^baseurl:' "${config_array[i]}" | sed "s/.*: *//;s/['\"]//g;s/#.*//")" + + if [[ -n $_tmp_baseurl ]]; then + _baseurl="$_tmp_baseurl" + break + fi + done + + else + # single config + _baseurl="$(grep '^baseurl:' "$_config" | sed "s/.*: *//;s/['\"]//g;s/#.*//")" + fi +} + +main() { + # clean up + if [[ -d $SITE_DIR ]]; then + rm -rf "$SITE_DIR" + fi + + read_baseurl + + # build + JEKYLL_ENV=production bundle exec jekyll b \ + -d "$SITE_DIR$_baseurl" -c "$_config" + + # test + bundle exec htmlproofer "$SITE_DIR" \ + --disable-external \ + --check-html \ + --allow_hash_href +} + +while (($#)); do + opt="$1" + case $opt in + -c | --config) + _config="$2" + shift + shift + ;; + -h | --help) + help + exit 0 + ;; + *) + # unknown option + help + exit 1 + ;; + esac +done + +main
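+
+# Usage sketches (illustrative only; the flags are the ones defined in the
+# option parser above):
+#
+#   bash ./tools/test                                   # build and test with _config.yml
+#   bash ./tools/test -c "_config.yml,_config.dev.yml"  # multiple configs; the last
+#                                                       # file defining `baseurl` wins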