图结构处理示例
在下面这个图结构处理示例中,我们先创建一个图,然后删除信息缺失的顶点(以及与之相连的边),最后合并平行边的属性。
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.sql.SparkSession
......
/**
 * GraphX structural-operator demo.
 *
 * Builds a small user graph, prints its triplets, removes the vertices that
 * only carry the default ("Missing") attribute together with their edges,
 * and finally merges parallel edges by joining their attributes.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val spark = SparkSession.builder()
    .master("local[*]")
    .appName("graphx demo")
    .getOrCreate()

  // Always release the session, even if one of the jobs below fails.
  try {
    val sc = spark.sparkContext

    // Vertex RDD: (id, (name, role))
    val users = sc.parallelize(
      Seq(
        (3L, ("rxin", "student")),
        (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")),
        (2L, ("istoica", "prof")),
        (4L, ("peter", "student"))
      )
    )

    // Edge RDD: Edge(srcId, dstId, relationship)
    val relationships = sc.parallelize(
      Seq(
        Edge(3L, 7L, "collab"),
        Edge(3L, 7L, "colleague"),
        Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"),
        Edge(5L, 7L, "pi"),
        Edge(4L, 0L, "student"),
        Edge(5L, 0L, "colleague")
      )
    )

    // Default attribute for vertices that are referenced by an edge but
    // are absent from the vertex RDD.
    val defaultUser = ("John Doe", "Missing")

    // Build the initial graph.
    val graph = Graph(users, relationships, defaultUser)

    // Note: user 0 (for which we have no information) is connected to
    // user 4 (peter) and user 5 (franklin).
    graph.triplets
      .map(t => s"${t.srcAttr._1}是${t.dstAttr._1}的${t.attr}")
      .collect()
      .foreach(println)

    // Drop the missing vertices and the edges attached to them: an edge
    // touching an invalid vertex is itself invalid, so filtering on the
    // vertex attribute yields a valid graph.
    val validGraph = graph.subgraph(vpred = (_, attribute) => attribute._2 != "Missing")

    // Removing user 0 disconnects users 4 and 5 from it in the valid subgraph.
    println("--------------------------------------")
    validGraph.vertices.collect().foreach(println)
    println("--------------------------------------")
    validGraph.triplets
      .map(t => s"${t.srcAttr._1}是${t.dstAttr._1}的${t.attr}")
      .collect()
      .foreach(println(_))

    // Merge parallel edges with the groupEdges structural operator.
    // Bring the partition strategies into scope.
    import org.apache.spark.graphx.PartitionStrategy._

    // The graph must be partitioned before edges can be grouped.
    // CanonicalRandomVertexCut ensures that all edges between two vertices
    // are co-located, regardless of direction.
    val partitionedUserGraph = validGraph.partitionBy(CanonicalRandomVertexCut)

    // Produce a graph without parallel edges, combining the attributes of
    // duplicate edges.
    val graphWithoutParallelEdges =
      partitionedUserGraph.groupEdges((e1, e2) => s"$e1 and $e2")

    // Print the resulting triplets.
    println("--------------------------------------")
    graphWithoutParallelEdges.triplets.collect().foreach(println)
  } finally {
    spark.stop()
  }
}
输出结果如下:
rxin是jgonzal的collab
rxin是jgonzal的colleague
franklin是rxin的advisor
istoica是franklin的colleague
franklin是jgonzal的pi
peter是John Doe的student
franklin是John Doe的colleague
--------------------------------------
(2,(istoica,prof))
(3,(rxin,student))
(4,(peter,student))
(5,(franklin,prof))
(7,(jgonzal,postdoc))
--------------------------------------
rxin是jgonzal的collab
rxin是jgonzal的colleague
franklin是rxin的advisor
istoica是franklin的colleague
franklin是jgonzal的pi
--------------------------------------
((5,(franklin,prof)),(7,(jgonzal,postdoc)),pi)
((3,(rxin,student)),(7,(jgonzal,postdoc)),collab and colleague)
((2,(istoica,prof)),(5,(franklin,prof)),colleague)
((5,(franklin,prof)),(3,(rxin,student)),advisor)