diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 7b5dc0b3487..331024f2451 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1 +1,3 @@ -Make sure that you generate site HTML with `jekyll build`, and include the changes to the HTML in your pull request also. See README.md for more information. +Build and test your changes locally according to the instructions in [README](../README.md). + +Once you've done that, submit a pull request with your changes. You only need to commit your changes to the source. A GitHub Actions workflow will [generate the corresponding HTML and push it for you](./workflows/html-push.yml). diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 9e1ffb348a5..00000000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1 +0,0 @@ - diff --git a/.github/workflows/doc_gen.yml b/.github/workflows/doc_gen.yml deleted file mode 100644 index 01478f92735..00000000000 --- a/.github/workflows/doc_gen.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: Check document generation - -on: - push: - branches: - - asf-site - pull_request: - branches: - - asf-site - -jobs: - lint: - name: check whether all documentation was generated with the right Jekyll version - runs-on: ubuntu-24.04 - steps: - - name: Free up disk space - shell: 'script -q -e -c "bash {0}"' - run: | - echo "==================================" - echo "Free up disk space on CI system" - echo "==================================" - - echo "Listing top 100 largest packages (from large to small)" - printf "Installed-Size\tPackage\n" - dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n -r | head -n 100 - df -h - - echo "Removing large packages" - rm -rf /__t/CodeQL - rm -rf /__t/go - rm -rf /__t/node - - apt-get remove --purge -y '^aspnet.*' || true - apt-get remove --purge -y '^dotnet-.*' || true - apt-get remove --purge -y '^llvm-.*' || true - apt-get remove --purge -y 'php.*' || true - apt-get 
remove --purge -y '^mongodb-.*' || true - apt-get remove --purge -y 'gfortran-11' || true - apt-get remove --purge -y 'humanity-icon-theme' || true - apt-get remove --purge -y 'nodejs-doc' || true - apt-get remove --purge -y snapd google-chrome-stable microsoft-edge-stable firefox || true - apt-get remove --purge -y azure-cli google-cloud-sdk mono-devel powershell libgl1-mesa-dri || true - apt-get autoremove --purge -y - apt-get clean - - df -h - - name: Checkout Spark Website repository - uses: actions/checkout@v2 - - name: Install dependencies for documentation generation - run: | - sudo apt-get update -y - sudo apt-get install -y ruby ruby-dev - sudo gem install bundler --version 2.4.19 - bundle install - - name: Run documentation build - run: | - export LC_ALL=C.UTF-8 - export LANG=C.UTF-8 - OLD_IFS=$IFS - IFS= - GEN_ERRORS=$(bundle exec jekyll build 3>&2 2>&1 1>&3) - if [ $(echo $GEN_ERRORS| grep -v -e '^$'| grep -c -v "rubygems_integration") -ne 0 ]; then - echo "Error during document generation:" - echo $GEN_ERRORS - exit 1 - fi - IFS=$OLD_IFS - CHANGED_FILE=( $(git ls-files --modified --other --exclude-standard --directory | grep -v sitemap.xml | grep -v llms.txt) ) - if [ ${#CHANGED_FILE[@]} -ne 0 ]; then - echo "Not all documentation was generated and/or not the right Jekyll version was used! Modified / untracked files (excluding sitemap.xml):" - echo ${CHANGED_FILE[*]} - echo "Git diff (excluding sitemap.xml):" - git diff -- . 
':(exclude)site/sitemap.xml' - exit 1 - fi - shell: /bin/bash {0} diff --git a/.github/workflows/html-build.yml b/.github/workflows/html-build.yml new file mode 100644 index 00000000000..87cd791bc75 --- /dev/null +++ b/.github/workflows/html-build.yml @@ -0,0 +1,24 @@ +name: Build HTML + +on: + pull_request: + branches: + - asf-site + +jobs: + build: + name: Build HTML + runs-on: ubuntu-24.04 + # This job only builds the site to validate the PR; it never pushes. + permissions: + contents: read + steps: + - name: Checkout Spark Website repository + uses: actions/checkout@v7 + - name: Set up Ruby and Bundler + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.4' + bundler-cache: true + - name: Run documentation build + run: bundle exec jekyll build diff --git a/.github/workflows/html-push.yml b/.github/workflows/html-push.yml new file mode 100644 index 00000000000..3cf5662cab7 --- /dev/null +++ b/.github/workflows/html-push.yml @@ -0,0 +1,46 @@ +name: Build and Push HTML + +on: + push: + branches: + - asf-site + # TODO: Remove after testing. + - automated-html + +jobs: + commit: + name: Build and commit HTML to `asf-site` + # Skip commits created _by_ this job so it never re-triggers itself. Match only + # a leading `[html]` tag so commits that merely mention `[html]` are still built. + if: "!startsWith(github.event.head_commit.message, '[html]')" + # Not technically necessary, but helps avoid spurious failures if multiple + # commits are pushed in rapid succession. 
+ concurrency: + group: html-push-${{ github.ref }} + cancel-in-progress: true + runs-on: ubuntu-24.04 + permissions: + contents: write + steps: + - name: Checkout Spark Website repository + uses: actions/checkout@v7 + - name: Set up Ruby and Bundler + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.4' + bundler-cache: true + - name: Run documentation build + run: bundle exec jekyll build + - name: Commit and push generated HTML + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + # `-f` because we told git to otherwise ignore `site/` + git add -f site/ + if git diff --cached --quiet; then + echo "No changes to commit." + else + COMMIT_TITLE=$(git log -1 --pretty=%s) + git commit -m "[html] $COMMIT_TITLE" + git push + fi diff --git a/.gitignore b/.gitignore index d0d5f1c2673..90ef9e92051 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ target/ .jekyll-cache/ .jekyll-metadata .local_ruby_bundle -site/python +site/ diff --git a/README.md b/README.md index bef477a201f..7b776039361 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,12 @@ -## Generating the website HTML +# Apache Spark Main Website + +This repository captures the main Apache Spark website located at https://spark.apache.org. The programming docs are [in the main Spark repo][1], not here. + +[1]: https://github.com/apache/spark/tree/master/docs + +To contribute changes, see [CONTRIBUTING](.github/CONTRIBUTING.md). + +## Generating the website HTML locally In this directory you will find text files formatted using Markdown, with an `.md` suffix. @@ -28,22 +36,6 @@ of Spark from the Spark source repository and then copied to the website under t directory. See the instructions for building those in the readme in the Spark project's `/docs` directory. -## Rouge and Pygments - -We also use [Rouge](https://github.com/rouge-ruby/rouge) for syntax highlighting in documentation Markdown pages. 
-Its HTML output is compatible with CSS files designed for [Pygments](https://pygments.org/). - -To mark a block of code in your Markdown to be syntax highlighted by `jekyll` during the -compile phase, use the following syntax: - - {% highlight scala %} - // Your Scala code goes here, you can replace Scala with many other - // supported languages too. - {% endhighlight %} - -You probably don't need to install that unless you want to regenerate the Pygments CSS file. -It requires Python, and can be installed by running `sudo easy_install Pygments`. - ## Merge PR -To merge pull request, use the `merge_pr.py` script which also squashes the commits. +To merge a pull request, use the `merge_pr.py` script. This script also squashes the commits. diff --git a/_config.yml b/_config.yml index 98f05068c08..5e5be4c0c88 100644 --- a/_config.yml +++ b/_config.yml @@ -4,6 +4,6 @@ kramdown: entity_output: symbol permalink: none destination: site -exclude: ['README.md', 'content', 'LICENSE', 'merge_pr.py', 'Gemfile', 'Gemfile.lock'] +exclude: ['README.md', 'LICENSE', 'merge_pr.py', 'Gemfile', 'Gemfile.lock'] keep_files: ['docs', 'static', 'llms.txt'] url: https://spark.apache.org diff --git a/content b/content deleted file mode 120000 index d97e1006371..00000000000 --- a/content +++ /dev/null @@ -1 +0,0 @@ -site \ No newline at end of file diff --git a/site/sitemap.xml b/site/sitemap.xml index 3ef7e411a11..19f2ddf2f1d 100644 --- a/site/sitemap.xml +++ b/site/sitemap.xml @@ -1163,7 +1163,10 @@ https://spark.apache.org/releases/spark-release-0-3.html weekly - + + https://spark.apache.org/ + weekly + https://spark.apache.org/404.html weekly @@ -1205,23 +1208,19 @@ weekly - https://spark.apache.org/history.html - weekly - - - https://spark.apache.org/improvement-proposals.html + https://spark.apache.org/graphx/ weekly - https://spark.apache.org/spark-connect/ + https://spark.apache.org/history.html weekly - https://spark.apache.org/pandas-on-spark/ + 
https://spark.apache.org/improvement-proposals.html weekly - https://spark.apache.org/graphx/ + https://spark.apache.org/mailing-lists.html weekly @@ -1229,46 +1228,45 @@ weekly - https://spark.apache.org/streaming/ + https://spark.apache.org/news/ weekly - https://spark.apache.org/news/ + https://spark.apache.org/pandas-on-spark/ weekly - https://spark.apache.org/screencasts/ + https://spark.apache.org/powered-by.html weekly - https://spark.apache.org/sql/ + https://spark.apache.org/release-process.html weekly - https://spark.apache.org/ + https://spark.apache.org/research.html weekly - https://spark.apache.org/mailing-lists.html + https://spark.apache.org/screencasts/ weekly - https://spark.apache.org/powered-by.html + https://spark.apache.org/security.html weekly - https://spark.apache.org/release-process.html + https://spark.apache.org/spark-connect/ weekly - https://spark.apache.org/research.html + https://spark.apache.org/sql/ weekly - https://spark.apache.org/security.html + https://spark.apache.org/streaming/ weekly - https://spark.apache.org/third-party-projects.html weekly @@ -1281,5 +1279,4 @@ https://spark.apache.org/versioning-policy.html weekly - diff --git a/sitemap.xml b/sitemap.xml index 257359ef91c..71fb8b093dc 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -151,9 +151,18 @@ sitemap: false weekly {% endfor %} -{% for page in site.pages %}{% if page.sitemap != false %} +{%- comment -%} +Explicitly sort `site.pages` so that the order is consistent and we don't get spurious git diffs. +`site.posts` doesn't have this issue because it's already sorted. +See: https://jekyllrb.com/docs/variables/#site-variables +{%- endcomment -%} +{%- assign sorted_pages = site.pages | sort: "url" -%} +{%- for page in sorted_pages -%} +{%- if page.sitemap != false -%} + {{ site.url }}{{ page.url }} weekly -{% endif %} -{% endfor %} + +{% endif %} +{%- endfor -%}