diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 7b5dc0b3487..331024f2451 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -1 +1,3 @@
-Make sure that you generate site HTML with `jekyll build`, and include the changes to the HTML in your pull request also. See README.md for more information.
+Build and test your changes locally according to the instructions in [README](../README.md).
+
+Once you've done that, submit a pull request with your changes. You only need to commit your changes to the source; do not commit the generated HTML under `site/`. After your pull request is merged, a GitHub Actions workflow will [generate the corresponding HTML and push it for you](./workflows/html-push.yml).
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
deleted file mode 100644
index 9e1ffb348a5..00000000000
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/.github/workflows/doc_gen.yml b/.github/workflows/doc_gen.yml
deleted file mode 100644
index 01478f92735..00000000000
--- a/.github/workflows/doc_gen.yml
+++ /dev/null
@@ -1,76 +0,0 @@
-name: Check document generation
-
-on:
- push:
- branches:
- - asf-site
- pull_request:
- branches:
- - asf-site
-
-jobs:
- lint:
- name: check whether all documentation was generated with the right Jekyll version
- runs-on: ubuntu-24.04
- steps:
- - name: Free up disk space
- shell: 'script -q -e -c "bash {0}"'
- run: |
- echo "=================================="
- echo "Free up disk space on CI system"
- echo "=================================="
-
- echo "Listing top 100 largest packages (from large to small)"
- printf "Installed-Size\tPackage\n"
- dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n -r | head -n 100
- df -h
-
- echo "Removing large packages"
- rm -rf /__t/CodeQL
- rm -rf /__t/go
- rm -rf /__t/node
-
- apt-get remove --purge -y '^aspnet.*' || true
- apt-get remove --purge -y '^dotnet-.*' || true
- apt-get remove --purge -y '^llvm-.*' || true
- apt-get remove --purge -y 'php.*' || true
- apt-get remove --purge -y '^mongodb-.*' || true
- apt-get remove --purge -y 'gfortran-11' || true
- apt-get remove --purge -y 'humanity-icon-theme' || true
- apt-get remove --purge -y 'nodejs-doc' || true
- apt-get remove --purge -y snapd google-chrome-stable microsoft-edge-stable firefox || true
- apt-get remove --purge -y azure-cli google-cloud-sdk mono-devel powershell libgl1-mesa-dri || true
- apt-get autoremove --purge -y
- apt-get clean
-
- df -h
- - name: Checkout Spark Website repository
- uses: actions/checkout@v2
- - name: Install dependencies for documentation generation
- run: |
- sudo apt-get update -y
- sudo apt-get install -y ruby ruby-dev
- sudo gem install bundler --version 2.4.19
- bundle install
- - name: Run documentation build
- run: |
- export LC_ALL=C.UTF-8
- export LANG=C.UTF-8
- OLD_IFS=$IFS
- IFS=
- GEN_ERRORS=$(bundle exec jekyll build 3>&2 2>&1 1>&3)
- if [ $(echo $GEN_ERRORS| grep -v -e '^$'| grep -c -v "rubygems_integration") -ne 0 ]; then
- echo "Error during document generation:"
- echo $GEN_ERRORS
- exit 1
- fi
- IFS=$OLD_IFS
- CHANGED_FILE=( $(git ls-files --modified --other --exclude-standard --directory | grep -v sitemap.xml | grep -v llms.txt) )
- if [ ${#CHANGED_FILE[@]} -ne 0 ]; then
- echo "Not all documentation was generated and/or not the right Jekyll version was used! Modified / untracked files (excluding sitemap.xml):"
- echo ${CHANGED_FILE[*]}
- echo "Git diff (excluding sitemap.xml):"
- git diff -- . ':(exclude)site/sitemap.xml'
- exit 1
- fi
- shell: /bin/bash {0}
diff --git a/.github/workflows/html-build.yml b/.github/workflows/html-build.yml
new file mode 100644
index 00000000000..87cd791bc75
--- /dev/null
+++ b/.github/workflows/html-build.yml
@@ -0,0 +1,24 @@
+# Pull-request check: verifies that the site still builds with Jekyll.
+# This workflow only builds; it has no step that commits or pushes the output.
+name: Build HTML
+
+on:
+  pull_request:
+    branches:
+      - asf-site
+
+jobs:
+  build:
+    name: Build HTML
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checkout Spark Website repository
+        uses: actions/checkout@v7
+      - name: Set up Ruby and Bundler
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.4'
+          # Runs `bundle install` and caches the installed gems between runs.
+          bundler-cache: true
+      - name: Run documentation build
+        run: bundle exec jekyll build
diff --git a/.github/workflows/html-push.yml b/.github/workflows/html-push.yml
new file mode 100644
index 00000000000..3cf5662cab7
--- /dev/null
+++ b/.github/workflows/html-push.yml
@@ -0,0 +1,53 @@
+# Regenerates the site HTML with Jekyll after every push and commits the
+# result back to the repository, so contributors only commit source changes.
+name: Build and Push HTML
+
+on:
+  push:
+    branches:
+      - asf-site
+      # TODO: Remove after testing.
+      - automated-html
+
+jobs:
+  commit:
+    name: Build and commit HTML to `asf-site`
+    # This condition is important. We don't want to trigger this job if the last
+    # commit was created _by_ this job! Generated commits always *start* with
+    # `[html]` (see the final step), so match the prefix only; a human commit
+    # that merely mentions `[html]` somewhere must still trigger a rebuild.
+    if: "!startsWith(github.event.head_commit.message, '[html]')"
+    # Not technically necessary, but helps avoid spurious failures if multiple
+    # commits are pushed in rapid succession.
+    concurrency:
+      group: html-push-${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ubuntu-24.04
+    permissions:
+      # Needed so the final step can push the generated commit.
+      contents: write
+    steps:
+      - name: Checkout Spark Website repository
+        uses: actions/checkout@v7
+      - name: Set up Ruby and Bundler
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.4'
+          bundler-cache: true
+      - name: Run documentation build
+        run: bundle exec jekyll build
+      - name: Commit and push generated HTML
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          # `-f` because we told git to otherwise ignore `site/`
+          git add -f site/
+          if git diff --cached --quiet; then
+            echo "No changes to commit."
+          else
+            # Reuse the triggering commit's subject line, tagged with `[html]`
+            # so the job-level `if:` guard skips the commit we push here.
+            COMMIT_TITLE=$(git log -1 --pretty=%s)
+            git commit -m "[html] $COMMIT_TITLE"
+            git push
+          fi
diff --git a/.gitignore b/.gitignore
index d0d5f1c2673..90ef9e92051 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,4 @@ target/
.jekyll-cache/
.jekyll-metadata
.local_ruby_bundle
-site/python
+site/
diff --git a/README.md b/README.md
index bef477a201f..7b776039361 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,12 @@
-## Generating the website HTML
+# Apache Spark Main Website
+
+This repository contains the source for the main Apache Spark website, published at https://spark.apache.org. The programming docs are [in the main Spark repo][1], not here.
+
+[1]: https://github.com/apache/spark/tree/master/docs
+
+To contribute changes, see [CONTRIBUTING](.github/CONTRIBUTING.md).
+
+## Generating the website HTML locally
In this directory you will find text files formatted using Markdown, with an `.md` suffix.
@@ -28,22 +36,6 @@ of Spark from the Spark source repository and then copied to the website under t
directory. See the instructions for building those in the readme in the Spark
project's `/docs` directory.
-## Rouge and Pygments
-
-We also use [Rouge](https://github.com/rouge-ruby/rouge) for syntax highlighting in documentation Markdown pages.
-Its HTML output is compatible with CSS files designed for [Pygments](https://pygments.org/).
-
-To mark a block of code in your Markdown to be syntax highlighted by `jekyll` during the
-compile phase, use the following syntax:
-
- {% highlight scala %}
- // Your Scala code goes here, you can replace Scala with many other
- // supported languages too.
- {% endhighlight %}
-
-You probably don't need to install that unless you want to regenerate the Pygments CSS file.
-It requires Python, and can be installed by running `sudo easy_install Pygments`.
-
## Merge PR
-To merge pull request, use the `merge_pr.py` script which also squashes the commits.
+To merge a pull request, use the `merge_pr.py` script. This script also squashes the commits.
diff --git a/_config.yml b/_config.yml
index 98f05068c08..5e5be4c0c88 100644
--- a/_config.yml
+++ b/_config.yml
@@ -4,6 +4,6 @@ kramdown:
entity_output: symbol
permalink: none
destination: site
-exclude: ['README.md', 'content', 'LICENSE', 'merge_pr.py', 'Gemfile', 'Gemfile.lock']
+exclude: ['README.md', 'LICENSE', 'merge_pr.py', 'Gemfile', 'Gemfile.lock']
keep_files: ['docs', 'static', 'llms.txt']
url: https://spark.apache.org
diff --git a/content b/content
deleted file mode 120000
index d97e1006371..00000000000
--- a/content
+++ /dev/null
@@ -1 +0,0 @@
-site
\ No newline at end of file
diff --git a/site/sitemap.xml b/site/sitemap.xml
index 3ef7e411a11..19f2ddf2f1d 100644
--- a/site/sitemap.xml
+++ b/site/sitemap.xml
@@ -1163,7 +1163,10 @@
https://spark.apache.org/releases/spark-release-0-3.html
weekly
-
+
+ https://spark.apache.org/
+ weekly
+
https://spark.apache.org/404.html
weekly
@@ -1205,23 +1208,19 @@
weekly
- https://spark.apache.org/history.html
- weekly
-
-
- https://spark.apache.org/improvement-proposals.html
+ https://spark.apache.org/graphx/
weekly
- https://spark.apache.org/spark-connect/
+ https://spark.apache.org/history.html
weekly
- https://spark.apache.org/pandas-on-spark/
+ https://spark.apache.org/improvement-proposals.html
weekly
- https://spark.apache.org/graphx/
+ https://spark.apache.org/mailing-lists.html
weekly
@@ -1229,46 +1228,45 @@
weekly
- https://spark.apache.org/streaming/
+ https://spark.apache.org/news/
weekly
- https://spark.apache.org/news/
+ https://spark.apache.org/pandas-on-spark/
weekly
- https://spark.apache.org/screencasts/
+ https://spark.apache.org/powered-by.html
weekly
- https://spark.apache.org/sql/
+ https://spark.apache.org/release-process.html
weekly
- https://spark.apache.org/
+ https://spark.apache.org/research.html
weekly
- https://spark.apache.org/mailing-lists.html
+ https://spark.apache.org/screencasts/
weekly
- https://spark.apache.org/powered-by.html
+ https://spark.apache.org/security.html
weekly
- https://spark.apache.org/release-process.html
+ https://spark.apache.org/spark-connect/
weekly
- https://spark.apache.org/research.html
+ https://spark.apache.org/sql/
weekly
- https://spark.apache.org/security.html
+ https://spark.apache.org/streaming/
weekly
-
https://spark.apache.org/third-party-projects.html
weekly
@@ -1281,5 +1279,4 @@
https://spark.apache.org/versioning-policy.html
weekly
-
diff --git a/sitemap.xml b/sitemap.xml
index 257359ef91c..71fb8b093dc 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -151,9 +151,18 @@ sitemap: false
weekly
{% endfor %}
-{% for page in site.pages %}{% if page.sitemap != false %}
+{%- comment -%}
+Explicitly sort `site.pages` so that the order is consistent and we don't get spurious git diffs.
+`site.posts` doesn't have this issue because it's already sorted.
+See: https://jekyllrb.com/docs/variables/#site-variables
+{%- endcomment -%}
+{%- assign sorted_pages = site.pages | sort: "url" -%}
+{%- for page in sorted_pages -%}
+{%- if page.sitemap != false -%}
+
{{ site.url }}{{ page.url }}
weekly
-{% endif %}
-{% endfor %}
+
+{% endif %}
+{%- endfor -%}