Merged
7 changes: 7 additions & 0 deletions connect/src/main/protobuf/graphframes.proto
@@ -35,6 +35,7 @@ message GraphFramesAPI {
SVDPlusPlus svd_plus_plus = 18;
TriangleCount triangle_count = 19;
Triplets triplets = 20;
KCore kcore = 21;
}
}

@@ -186,3 +187,9 @@ message TriangleCount {
}

message Triplets {}

message KCore {
bool use_local_checkpoints = 1;
int32 checkpoint_interval = 2;
optional StorageLevel storage_level = 3;
}
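A rough client-side sketch of how the new message can be populated with the standard protobuf-java builders (the `proto` package alias follows GraphFramesConnectUtils below; the surrounding plan-serialization plumbing is omitted and the values are illustrative):

val kcoreMsg = proto.KCore
  .newBuilder()
  .setUseLocalCheckpoints(false) // use reliable checkpoints
  .setCheckpointInterval(2)      // checkpoint every 2 Pregel rounds
  .build()
// storage_level is declared optional, so hasStorageLevel returns false when
// the client leaves it unset and the server keeps its default storage level.
val api = proto.GraphFramesAPI.newBuilder().setKcore(kcoreMsg)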
@@ -399,6 +399,19 @@ object GraphFramesConnectUtils {
case proto.GraphFramesAPI.MethodCase.TRIPLETS => {
graphFrame.triplets
}
case proto.GraphFramesAPI.MethodCase.KCORE => {
var kCoreBuilder =
graphFrame.kCore
.setCheckpointInterval(apiMessage.getKcore.getCheckpointInterval)
.setUseLocalCheckpoints(apiMessage.getKcore.getUseLocalCheckpoints)

if (apiMessage.getKcore.hasStorageLevel) {
kCoreBuilder = kCoreBuilder.setIntermediateStorageLevel(
parseStorageLevel(apiMessage.getKcore.getStorageLevel))
}

kCoreBuilder.run()
}
case _ => throw new GraphFramesUnreachableException() // Unreachable
}
}
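For reference, a sketch of the direct JVM-side call that this dispatch reduces to (values illustrative):

import org.apache.spark.storage.StorageLevel

// setIntermediateStorageLevel is applied only when the client actually set
// the optional storage_level field; otherwise the algorithm default is kept.
val coresDf = graphFrame.kCore
  .setCheckpointInterval(2)
  .setUseLocalCheckpoints(false)
  .setIntermediateStorageLevel(StorageLevel.MEMORY_AND_DISK)
  .run()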
@@ -0,0 +1,101 @@
package org.apache.spark.sql.graphframes.expressions

import org.apache.spark.sql.catalyst.expressions.BinaryExpression
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen.Block.*
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.IntegerType

/**
* Mandal, Aritra, and Mohammad Al Hasan. "A distributed k-core decomposition algorithm on spark."
* 2017 IEEE International Conference on Big Data (Big Data). IEEE, 2017.
*
* @param left
*   array of the neighbors' current core estimates
* @param right
*   the vertex's own current core estimate
*/
case class KCoreMerge(left: Expression, right: Expression)
extends BinaryExpression
with CodegenFallback {
override protected def withNewChildrenInternal(
newLeft: Expression,
newRight: Expression): Expression = copy(newLeft, newRight)

override def dataType: DataType = IntegerType

/**
* Each node initializes its core value with its own degree. Each node (say u) then sends
* messages to its neighbors v ∈ N(u) with the current estimate of its (u's) core value. For an
* undirected graph with m edges, at most 2m messages are sent in total during one
* message-passing round. Upon receiving all the messages from its neighbors, the vertex u
* computes the largest value l such that the number of neighbors of u whose current core value
* estimate is `l` or larger is itself at least `l`.
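* (For example, with current estimate 3 and neighbor estimates [4, 1, 3, 2], three neighbors
* report at least 2 but only two report at least 3, so the new estimate is 2.)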
*/
override protected def nullSafeEval(input1: Any, input2: Any): Any = {
val arrayOfElements = input1.asInstanceOf[ArrayData].toIntArray()
val currentCore = input2.asInstanceOf[Int]

val counts = arrayOfElements.foldLeft(new Array[Int](currentCore + 1))((acc, el) =>
if (el > currentCore) {
acc(currentCore) = acc(currentCore) + 1
acc
} else {
acc(el) = acc(el) + 1
acc
})

var currentWeight = 0
for (i <- currentCore to 1 by -1) {
currentWeight += counts(i)
if (i <= currentWeight) {
return i
}
}

return 0
}

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val arrayOfElements = ctx.freshName("arrayOfElements")
val currentCore = ctx.freshName("currentCore")
val counts = ctx.freshName("counts")
val currentWeight = ctx.freshName("currentWeight")
val el = ctx.freshName("el")
val i = ctx.freshName("i")

val leftGenCode = left.genCode(ctx)
val rightGenCode = right.genCode(ctx)
ev.copy(code"""
|${leftGenCode.code}
|${rightGenCode.code}
|int ${ev.value} = 0;
|boolean ${ev.isNull} = false;
|int[] $arrayOfElements = ${leftGenCode.value}.toIntArray();
|int $currentCore = ${rightGenCode.value};
|
|int[] $counts = new int[$currentCore + 1];
|for (int $i = 0; $i < $arrayOfElements.length; $i++) {
| int $el = $arrayOfElements[$i];
| if ($el > $currentCore) {
| $counts[$currentCore] += 1;
| } else {
| $counts[$el] += 1;
| }
|}
|
|int $currentWeight = 0;
|for (int $i = $currentCore; $i >= 1; $i--) {
| $currentWeight += $counts[$i];
| if ($i <= $currentWeight) {
| ${ev.value} = $i;
| break;
| }
|}
""".stripMargin)
}
}
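The update rule is an h-index-style computation over the neighbor estimates, capped at the vertex's current estimate. A standalone sketch that mirrors nullSafeEval (a hypothetical helper, not part of the patch):

// Largest l such that at least l neighbors currently report a core estimate
// of l or more, never exceeding the current estimate.
def merge(nbrCores: Array[Int], current: Int): Int = {
  val counts = new Array[Int](current + 1)
  nbrCores.foreach(c => counts(math.min(c, current)) += 1)
  var weight = 0
  var i = current
  while (i >= 1) {
    weight += counts(i) // neighbors reporting an estimate >= i
    if (i <= weight) return i
    i -= 1
  }
  0
}

merge(Array(5, 1, 3, 2), 4) // == 2: three neighbors report >= 2, only two report >= 3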
9 changes: 9 additions & 0 deletions core/src/main/scala/org/graphframes/GraphFrame.scala
@@ -616,6 +616,15 @@ class GraphFrame private (
}
}

/**
* K-Core decomposition.
*
* See [[org.graphframes.lib.KCore]] for more details.
*
* @group stdlib
*/
def kCore: KCore = new KCore(this)

/**
* Validates the consistency and integrity of a graph by performing checks on the vertices and
* edges.
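A minimal usage sketch of the new entry point (assumes a running SparkSession named `spark`):

import org.graphframes.GraphFrame
import spark.implicits._

// Toy graph: a triangle {a, b, c} plus a pendant vertex d attached to a.
val vertices = Seq("a", "b", "c", "d").toDF("id")
val edges = Seq(("a", "b"), ("b", "c"), ("c", "a"), ("a", "d")).toDF("src", "dst")
val g = GraphFrame(vertices, edges)

// One row per vertex with an integer `kcore` column. Expected cores:
// a = b = c = 2 (the triangle is the 2-core) and d = 1. Vertices that drop
// out of the inner computation are restored with core 0 by the left join in
// KCore.run below.
g.kCore.run().show()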
103 changes: 103 additions & 0 deletions core/src/main/scala/org/graphframes/lib/KCore.scala
@@ -0,0 +1,103 @@
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions.call_function
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.collect_list
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.when
import org.apache.spark.sql.graphframes.expressions.KCoreMerge
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.storage.StorageLevel
import org.graphframes.GraphFrame
import org.graphframes.Logging
import org.graphframes.WithCheckpointInterval
import org.graphframes.WithIntermediateStorageLevel
import org.graphframes.WithLocalCheckpoints

/**
* K-Core decomposition algorithm implementation for GraphFrames.
*
* This class provides the `run` method to compute the k-core decomposition of a graph, which
* assigns each vertex the maximum k such that the vertex is part of a k-core. A k-core is a
* maximal connected subgraph in which every vertex has degree at least k.
*
* The algorithm is based on the distributed k-core decomposition approach described in:
*
* Mandal, Aritra, and Mohammad Al Hasan. "A distributed k-core decomposition algorithm on spark."
* 2017 IEEE International Conference on Big Data (Big Data). IEEE, 2017.
*/
class KCore private[graphframes] (private val graph: GraphFrame)
extends Serializable
with WithIntermediateStorageLevel
with WithCheckpointInterval
with WithLocalCheckpoints {
import org.graphframes.lib.KCore.kCoreColumnName
def run(): DataFrame = {
val result =
KCore.run(graph, intermediateStorageLevel, checkpointInterval, useLocalCheckpoints)
val allVertices = graph.vertices
.select(GraphFrame.ID)
.join(result, Seq(GraphFrame.ID), "left")
.withColumn(
kCoreColumnName,
when(col(kCoreColumnName).isNull, lit(0)).otherwise(col(kCoreColumnName)))
.persist(intermediateStorageLevel)

// materialize
allVertices.count()
result.unpersist()
allVertices
}
}

object KCore extends Serializable with Logging {
val kCoreColumnName = "kcore"
def run(
graph: GraphFrame,
storageLevel: StorageLevel,
checkpointInterval: Int,
useLocalCheckpoints: Boolean): DataFrame = {
val degrees = graph.degrees
val preparedGraph = GraphFrame(
degrees.withColumn("degree", col("degree").cast(IntegerType)),
graph.edges.select(GraphFrame.SRC, GraphFrame.DST))

val functionRegistry = graph.vertices.sparkSession.sessionState.functionRegistry
functionRegistry.registerFunction(
FunctionIdentifier("_kcoreMerge"),
(children: Seq[Expression]) => KCoreMerge(children(0), children(1)),
"scala_udf")

try {
val pregel = preparedGraph.pregel
.setMaxIter(Int.MaxValue)
.setIntermediateStorageLevel(storageLevel)
.setCheckpointInterval(checkpointInterval)
.withVertexColumn(
kCoreColumnName,
col("degree"),
call_function("_kcoreMerge", Pregel.msg, col(kCoreColumnName)))
.sendMsgToSrc(Pregel.src(kCoreColumnName))
.sendMsgToDst(Pregel.dst(kCoreColumnName))
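// All vertices start active; a vertex turns inactive once its core estimate
// stops changing, and the run stops when no vertex is active. Inactive
// vertices keep sending messages, because their settled estimates still
// bound the estimates of their active neighbors.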
.setInitialActiveVertexExpression(lit(true))
.setUpdateActiveVertexExpression(
col(kCoreColumnName) =!= call_function("_kcoreMerge", Pregel.msg, col(kCoreColumnName)))
.setEarlyStopping(false)
.setStopIfAllNonActiveVertices(true)
.setSkipMessagesFromNonActiveVertices(false)
.setUseLocalCheckpoints(useLocalCheckpoints)
.aggMsgs(collect_list(Pregel.msg))

pregel.run()
} finally {
val dereg = functionRegistry.dropFunction(FunctionIdentifier("_kcoreMerge"))
if (!dereg) {
logWarn(
"graphframes faced an internal error and was not able to de-register function _kcoreMerge; Spark' functionRegistry is in a bad state")
}
}
}
}
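To make the fixed-point behaviour concrete, a hand trace on the toy graph from the usage sketch above (triangle {a, b, c} plus pendant d on a); this is a sketch of the message flow, not literal Pregel output:

// Initial estimates = degrees:        a=3, b=2, c=2, d=1
// Round 1: a merges [2, 2, 1] with 3 -> 2 (changed, stays active);
//          b merges [3, 2] with 2 -> 2, c likewise, d merges [3] with 1 -> 1
//          (unchanged, turn inactive).
// Round 2: a merges [2, 2, 1] with 2 -> 2; nothing changes, every vertex is
//          inactive, and setStopIfAllNonActiveVertices(true) ends the loop.
// Final cores: a=2, b=2, c=2, d=1, matching the expected 2-core and 1-core.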