50 changes: 50 additions & 0 deletions .github/workflows/python-publish.yml
@@ -0,0 +1,50 @@
name: python-pypi-publish

on:
push:
branches:
- main
tags:
- "*.*.*"

jobs:
release:
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/graphframes-py
permissions:
id-token: write
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up JDK
if: startsWith(github.ref, 'refs/tags/')
uses: actions/setup-java@v3
with:
java-version: '11'
distribution: 'zulu'

- name: Set up Python
if: startsWith(github.ref, 'refs/tags/')
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Set up Poetry
if: startsWith(github.ref, 'refs/tags/')
uses: snok/install-poetry@v1

- name: Build GraphFrames python
if: startsWith(github.ref, 'refs/tags/')
working-directory: python
run: |
poetry version ${{ github.ref_name }}
poetry build

- name: PyPi publish
if: startsWith(github.ref, 'refs/tags/')
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: python/dist
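
For local verification, the tag-gated steps above can be approximated with a short Python sketch. This is a rough equivalent only: the version string is illustrative (in CI it comes from github.ref_name), and publishing itself is delegated to pypa/gh-action-pypi-publish via PyPI trusted publishing (hence the id-token: write permission), so no upload step appears here.

import subprocess

# Mirror the "Build GraphFrames python" step: stamp the version, then build
# the sdist and wheel into python/dist (the directory the publish step uploads).
version = "0.8.4"  # illustrative; CI derives this from the pushed tag name
subprocess.run(["poetry", "version", version], cwd="python", check=True)
subprocess.run(["poetry", "build"], cwd="python", check=True)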
88 changes: 49 additions & 39 deletions python/dev/build_jar.py
@@ -1,54 +1,64 @@
import shutil
import subprocess
import sys
from collections.abc import Sequence
from pathlib import Path


def build(spark_version: str = "3.5.4"):
print("Building GraphFrames JAR...")
print(f"SPARK_VERSION: {spark_version[:3]}")
    assert spark_version[:3] in {"3.3", "3.4", "3.5"}, "Unsupported Spark version!"
project_root = Path(__file__).parent.parent.parent
sbt_executable = project_root.joinpath("build").joinpath("sbt").absolute().__str__()
sbt_build_command = [sbt_executable, f"-Dspark.version={spark_version}", "assembly"]
sbt_build = subprocess.Popen(
sbt_build_command,
stdout=subprocess.PIPE,
universal_newlines=True,
cwd=project_root,
)
while sbt_build.poll() is None:
assert sbt_build.stdout is not None # typing stuff
line = sbt_build.stdout.readline()
print(line.rstrip(), flush=True)

if sbt_build.returncode != 0:
print("Error during the build of GraphFrames JAR!")
print("stdout: ", sbt_build.stdout)
print("stderr: ", sbt_build.stderr)
sys.exit(1)
else:
print("Building DONE successfully!")
def build(spark_versions: Sequence[str] = ("3.5.5",)):
for spark_version in spark_versions:
print("Building GraphFrames JAR...")
print(f"SPARK_VERSION: {spark_version[:3]}")
        assert spark_version[:3] in {"3.5"}, "Unsupported Spark version!"
project_root = Path(__file__).parent.parent.parent
        sbt_executable = str(project_root.joinpath("build", "sbt").absolute())
sbt_build_command = [
sbt_executable,
f"-Dspark.version={spark_version}",
"clean",
"assembly",
]
sbt_build = subprocess.Popen(
sbt_build_command,
stdout=subprocess.PIPE,
universal_newlines=True,
cwd=project_root,
)
while sbt_build.poll() is None:
assert sbt_build.stdout is not None # typing stuff
line = sbt_build.stdout.readline()
print(line.rstrip(), flush=True)

        if sbt_build.returncode != 0:
            # stdout has already been streamed above; stderr is not captured separately
            print("Error during the build of GraphFrames JAR!")
            sys.exit(1)
        else:
            print("Building DONE successfully!")

python_resources = (
project_root.joinpath("python").joinpath("graphframes").joinpath("resources")
)
target_dir = project_root.joinpath("target").joinpath("scala-2.12")
gf_jar = None
python_resources = (
project_root.joinpath("python").joinpath("graphframes").joinpath("resources")
)
target_dir = project_root.joinpath("target").joinpath("scala-2.12")
gf_jar = None

for pp in target_dir.glob("*.jar"):
if "graphframes-assembly" in pp.name:
gf_jar = pp
break
for pp in target_dir.glob("*.jar"):
if "graphframes-assembly" in pp.name:
gf_jar = pp
break

assert gf_jar is not None, "Missing JAR!"
python_resources.mkdir(parents=True, exist_ok=True)
shutil.copy(gf_jar, python_resources.joinpath(gf_jar.name))
assert gf_jar is not None, "Missing JAR!"
        # Recreate the resources dir only for the first version, so JARs built
        # for earlier Spark versions in this loop are not deleted again.
        if spark_version == spark_versions[0]:
            shutil.rmtree(python_resources, ignore_errors=True)
        python_resources.mkdir(parents=True, exist_ok=True)
        shutil.copy(gf_jar, python_resources.joinpath(f"spark-{spark_version}-{gf_jar.name}"))


if __name__ == "__main__":
if len(sys.argv) > 1:
spark_version = sys.argv[1]
build(spark_version)
        spark_versions = list(sys.argv[1:])
        build(spark_versions)
else:
build()
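
As a usage sketch, the new multi-version entry point can also be called directly (assuming dev/ is on sys.path; the version list is illustrative):

from build_jar import build

# Builds the assembly JAR once per requested Spark version and copies each into
# python/graphframes/resources as spark-<version>-<assembly jar name>.
build(["3.5.5"])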
31 changes: 30 additions & 1 deletion python/graphframes/__init__.py
@@ -1,3 +1,32 @@
import pathlib
from importlib import resources

from pyspark.version import __version__

from .graphframe import GraphFrame

__all__ = ["GraphFrame"]

def get_gf_jar_location() -> str:
"""
Returns a location of the GraphFrames JAR,
included to the distribution of the graphframes-py.

Usage: just add the returned value of the function to `spark.jars`:
`SparkSession.builder.master(...).config("spark.jars", get_gf_jar_location()).getOrCreate()`.

In the case your version of PySpark is not compatible with the version of GraphFrames,
this function will raise an exception!
"""
resources_root = resources.files("graphframes").joinpath("resources")

for pp in resources_root.iterdir():
assert isinstance(pp, pathlib.PosixPath) # type checking
if pp.is_file() and pp.name.endswith(".jar") and __version__[:5] in pp.name:
return str(pp.absolute())

    raise ValueError(
        f"Your version of Spark {__version__} is not supported by this version of graphframes!"
    )


__all__ = ["GraphFrame", "get_gf_jar_location"]
24 changes: 12 additions & 12 deletions python/poetry.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions python/pyproject.toml
@@ -23,7 +23,7 @@ include = [
]

[tool.poetry.build]
script = "dev/build_jar.py"
script = "dev/build_jar.py" # Spark version should be aligned inside the script!


[tool.poetry.urls]
@@ -35,14 +35,14 @@ script = "dev/build_jar.py"
[tool.poetry.dependencies]
python = ">=3.10 <3.13"
nose = "1.3.7"
pyspark = ">=3.4 <4.0"
pyspark = ">=3.5 <4.0"
numpy = ">= 1.7"

[tool.poetry.group.dev.dependencies]
black = "^23.12.1"
flake8 = "^7.1.1"
isort = "^6.0.0"
pyspark = { version = "3.5.4", extras = ["connect"] }
pyspark = { version = "3.5.5", extras = ["connect"] }
grpcio = "<=1.67.1"
pytest = "^8.3.4"

13 changes: 2 additions & 11 deletions python/tests/conftest.py
@@ -1,13 +1,12 @@
import pathlib
import shutil
import warnings
from importlib import resources

import pytest
from pyspark.sql import SparkSession
from pyspark.version import __version__

from graphframes import GraphFrame
from graphframes import GraphFrame, get_gf_jar_location
from graphframes.classic.graphframe import _java_api

if __version__[:3] >= "3.4":
@@ -43,15 +42,7 @@ def spark():
spark_builder = SparkSession.builder.master("local[4]").config(
"spark.sql.shuffle.partitions", 4
)
resources_root = resources.files("graphframes").joinpath("resources")
spark_jars = []
for pp in resources_root.iterdir():
assert isinstance(pp, pathlib.PosixPath) # type checking
if pp.is_file() and pp.name.endswith(".jar"):
spark_jars.append(pp.absolute().__str__())
if spark_jars:
jars_str = ",".join(spark_jars)
spark = spark_builder.config("spark.jars", jars_str)
    spark_builder = spark_builder.config("spark.jars", get_gf_jar_location())
spark = spark_builder.getOrCreate()
spark.sparkContext.setCheckpointDir(checkpointDir)
yield spark