Merged
7 changes: 7 additions & 0 deletions connect/src/main/protobuf/graphframes.proto
@@ -35,6 +35,7 @@ message GraphFramesAPI {
SVDPlusPlus svd_plus_plus = 18;
TriangleCount triangle_count = 19;
Triplets triplets = 20;
KCore kcore = 21;
}
}

@@ -186,3 +187,9 @@ message TriangleCount {
}

message Triplets {}

message KCore {
bool use_local_checkpoints = 1;
int32 checkpoint_interval = 2;
optional StorageLevel storage_level = 3;
}
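A rough client-side sketch of how the new message can be populated with the standard protobuf-java builders (the `proto` package alias follows GraphFramesConnectUtils below; the surrounding plan-serialization plumbing is omitted and the values are illustrative):

val kcoreMsg = proto.KCore
  .newBuilder()
  .setUseLocalCheckpoints(false) // use reliable checkpoints
  .setCheckpointInterval(2)      // checkpoint every 2 Pregel rounds
  .build()
// storage_level is declared optional, so hasStorageLevel returns false when
// the client leaves it unset and the server keeps its default storage level.
val api = proto.GraphFramesAPI.newBuilder().setKcore(kcoreMsg)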
@@ -399,6 +399,19 @@ object GraphFramesConnectUtils {
case proto.GraphFramesAPI.MethodCase.TRIPLETS => {
graphFrame.triplets
}
case proto.GraphFramesAPI.MethodCase.KCORE => {
var kCoreBuilder =
graphFrame.kCore
.setCheckpointInterval(apiMessage.getKcore.getCheckpointInterval)
.setUseLocalCheckpoints(apiMessage.getKcore.getUseLocalCheckpoints)

if (apiMessage.getKcore.hasStorageLevel) {
kCoreBuilder = kCoreBuilder.setIntermediateStorageLevel(
parseStorageLevel(apiMessage.getKcore.getStorageLevel))
}

kCoreBuilder.run()
}
case _ => throw new GraphFramesUnreachableException() // Unreachable
}
}
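For reference, a sketch of the direct JVM-side call that this dispatch reduces to (values illustrative):

import org.apache.spark.storage.StorageLevel

// setIntermediateStorageLevel is applied only when the client actually set
// the optional storage_level field; otherwise the algorithm default is kept.
val coresDf = graphFrame.kCore
  .setCheckpointInterval(2)
  .setUseLocalCheckpoints(false)
  .setIntermediateStorageLevel(StorageLevel.MEMORY_AND_DISK)
  .run()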
@@ -0,0 +1,101 @@
package org.apache.spark.sql.graphframes.expressions

import org.apache.spark.sql.catalyst.expressions.BinaryExpression
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen.Block.*
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.IntegerType

/**
* Mandal, Aritra, and Mohammad Al Hasan. "A distributed k-core decomposition algorithm on spark."
* 2017 IEEE International Conference on Big Data (Big Data). IEEE, 2017.
*
* @param left
*   array of the neighbors' current core estimates
* @param right
*   the vertex's own current core estimate
*/
case class KCoreMerge(left: Expression, right: Expression)
extends BinaryExpression
with CodegenFallback {
override protected def withNewChildrenInternal(
newLeft: Expression,
newRight: Expression): Expression = copy(newLeft, newRight)

override def dataType: DataType = IntegerType

/**
* Each node initializes its core value with its own degree. Each node (say u) then sends
* messages to its neighbors v ∈ N(u) with the current estimate of its (u's) core value. For an
* undirected graph with m edges, at most 2m messages are sent in total during one
* message-passing round. Upon receiving all the messages from its neighbors, the vertex u
* computes the largest value l such that the number of neighbors of u whose current core value
* estimate is `l` or larger is itself at least `l`.
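* (For example, with current estimate 3 and neighbor estimates [4, 1, 3, 2], three neighbors
* report at least 2 but only two report at least 3, so the new estimate is 2.)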
*/
override protected def nullSafeEval(input1: Any, input2: Any): Any = {
val arrayOfElements = input1.asInstanceOf[ArrayData].toIntArray()
val currentCore = input2.asInstanceOf[Int]

val counts = arrayOfElements.foldLeft(new Array[Int](currentCore + 1))((acc, el) =>
if (el > currentCore) {
acc(currentCore) = acc(currentCore) + 1
acc
} else {
acc(el) = acc(el) + 1
acc
})

var currentWeight = 0
for (i <- currentCore to 1 by -1) {
currentWeight += counts(i)
if (i <= currentWeight) {
return i
}
}

return 0
}

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val arrayOfElements = ctx.freshName("arrayOfElements")
val currentCore = ctx.freshName("currentCore")
val counts = ctx.freshName("counts")
val currentWeight = ctx.freshName("currentWeight")
val el = ctx.freshName("el")
val i = ctx.freshName("i")

val leftGenCode = left.genCode(ctx)
val rightGenCode = right.genCode(ctx)
ev.copy(code"""
|${leftGenCode.code}
|${rightGenCode.code}
|int ${ev.value} = 0;
|boolean ${ev.isNull} = false;
|int[] $arrayOfElements = ${leftGenCode.value}.toIntArray();
|int $currentCore = ${rightGenCode.value};
|
|int[] $counts = new int[$currentCore + 1];
|for (int $i = 0; $i < $arrayOfElements.length; $i++) {
| int $el = $arrayOfElements[$i];
| if ($el > $currentCore) {
| $counts[$currentCore] += 1;
| } else {
| $counts[$el] += 1;
| }
|}
|
|int $currentWeight = 0;
|for (int $i = $currentCore; $i >= 1; $i--) {
| $currentWeight += $counts[$i];
| if ($i <= $currentWeight) {
| ${ev.value} = $i;
| break;
| }
|}
""".stripMargin)
}
}
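The update rule is an h-index-style computation over the neighbor estimates, capped at the vertex's current estimate. A standalone sketch that mirrors nullSafeEval (a hypothetical helper, not part of the patch):

// Largest l such that at least l neighbors currently report a core estimate
// of l or more, never exceeding the current estimate.
def merge(nbrCores: Array[Int], current: Int): Int = {
  val counts = new Array[Int](current + 1)
  nbrCores.foreach(c => counts(math.min(c, current)) += 1)
  var weight = 0
  var i = current
  while (i >= 1) {
    weight += counts(i) // neighbors reporting an estimate >= i
    if (i <= weight) return i
    i -= 1
  }
  0
}

merge(Array(5, 1, 3, 2), 4) // == 2: three neighbors report >= 2, only two report >= 3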
9 changes: 9 additions & 0 deletions core/src/main/scala/org/graphframes/GraphFrame.scala
@@ -616,6 +616,15 @@ class GraphFrame private (
}
}

/**
* K-Core decomposition.
*
* See [[org.graphframes.lib.KCore]] for more details.
*
* @group stdlib
*/
def kCore: KCore = new KCore(this)

/**
* Validates the consistency and integrity of a graph by performing checks on the vertices and
* edges.
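A minimal usage sketch of the new entry point (assumes a running SparkSession named `spark`):

import org.graphframes.GraphFrame
import spark.implicits._

// Toy graph: a triangle {a, b, c} plus a pendant vertex d attached to a.
val vertices = Seq("a", "b", "c", "d").toDF("id")
val edges = Seq(("a", "b"), ("b", "c"), ("c", "a"), ("a", "d")).toDF("src", "dst")
val g = GraphFrame(vertices, edges)

// One row per vertex with an integer `kcore` column. Expected cores:
// a = b = c = 2 (the triangle is the 2-core) and d = 1. Vertices that drop
// out of the inner computation are restored with core 0 by the left join in
// KCore.run below.
g.kCore.run().show()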
103 changes: 103 additions & 0 deletions core/src/main/scala/org/graphframes/lib/KCore.scala
@@ -0,0 +1,103 @@
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions.call_function
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.collect_list
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.when
import org.apache.spark.sql.graphframes.expressions.KCoreMerge
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.storage.StorageLevel
import org.graphframes.GraphFrame
import org.graphframes.Logging
import org.graphframes.WithCheckpointInterval
import org.graphframes.WithIntermediateStorageLevel
import org.graphframes.WithLocalCheckpoints

/**
* K-Core decomposition algorithm implementation for GraphFrames.
*
* This class provides the `run` method to compute the k-core decomposition of a graph, which
* assigns each vertex the maximum k such that the vertex is part of a k-core. A k-core is a
* maximal connected subgraph in which every vertex has degree at least k.
*
* The algorithm is based on the distributed k-core decomposition approach described in:
*
* Mandal, Aritra, and Mohammad Al Hasan. "A distributed k-core decomposition algorithm on spark."
* 2017 IEEE International Conference on Big Data (Big Data). IEEE, 2017.
*/
class KCore private[graphframes] (private val graph: GraphFrame)
extends Serializable
with WithIntermediateStorageLevel
with WithCheckpointInterval
with WithLocalCheckpoints {
import org.graphframes.lib.KCore.kCoreColumnName
def run(): DataFrame = {
val result =
KCore.run(graph, intermediateStorageLevel, checkpointInterval, useLocalCheckpoints)
val allVertices = graph.vertices
.select(GraphFrame.ID)
.join(result, Seq(GraphFrame.ID), "left")
.withColumn(
kCoreColumnName,
when(col(kCoreColumnName).isNull, lit(0)).otherwise(col(kCoreColumnName)))
.persist(intermediateStorageLevel)

// materialize
allVertices.count()
result.unpersist()
allVertices
}
}

object KCore extends Serializable with Logging {
val kCoreColumnName = "kcore"
def run(
graph: GraphFrame,
storageLevel: StorageLevel,
checkpointInterval: Int,
useLocalCheckpoints: Boolean): DataFrame = {
val degrees = graph.degrees
val preparedGraph = GraphFrame(
degrees.withColumn("degree", col("degree").cast(IntegerType)),
graph.edges.select(GraphFrame.SRC, GraphFrame.DST))

val functionRegistry = graph.vertices.sparkSession.sessionState.functionRegistry
functionRegistry.registerFunction(
FunctionIdentifier("_kcoreMerge"),
(children: Seq[Expression]) => KCoreMerge(children(0), children(1)),
"scala_udf")

try {
val pregel = preparedGraph.pregel
.setMaxIter(Int.MaxValue)
.setIntermediateStorageLevel(storageLevel)
.setCheckpointInterval(checkpointInterval)
.withVertexColumn(
kCoreColumnName,
col("degree"),
call_function("_kcoreMerge", Pregel.msg, col(kCoreColumnName)))
.sendMsgToSrc(Pregel.src(kCoreColumnName))
.sendMsgToDst(Pregel.dst(kCoreColumnName))
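// All vertices start active; a vertex turns inactive once its core estimate
// stops changing, and the run stops when no vertex is active. Inactive
// vertices keep sending messages, because their settled estimates still
// bound the estimates of their active neighbors.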
.setInitialActiveVertexExpression(lit(true))
.setUpdateActiveVertexExpression(
col(kCoreColumnName) =!= call_function("_kcoreMerge", Pregel.msg, col(kCoreColumnName)))
.setEarlyStopping(false)
.setStopIfAllNonActiveVertices(true)
.setSkipMessagesFromNonActiveVertices(false)
.setUseLocalCheckpoints(useLocalCheckpoints)
.aggMsgs(collect_list(Pregel.msg))

pregel.run()
} finally {
val dereg = functionRegistry.dropFunction(FunctionIdentifier("_kcoreMerge"))
if (!dereg) {
logWarn(
"graphframes faced an internal error and was not able to de-register function _kcoreMerge; Spark' functionRegistry is in a bad state")
}
}
}
}
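To make the fixed-point behaviour concrete, a hand trace on the toy graph from the usage sketch above (triangle {a, b, c} plus pendant d on a); this is a sketch of the message flow, not literal Pregel output:

// Initial estimates = degrees:        a=3, b=2, c=2, d=1
// Round 1: a merges [2, 2, 1] with 3 -> 2 (changed, stays active);
//          b merges [3, 2] with 2 -> 2, c likewise, d merges [3] with 1 -> 1
//          (unchanged, turn inactive).
// Round 2: a merges [2, 2, 1] with 2 -> 2; nothing changes, every vertex is
//          inactive, and setStopIfAllNonActiveVertices(true) ends the loop.
// Final cores: a=2, b=2, c=2, d=1, matching the expected 2-core and 1-core.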