NovemberRain/pyspark_project

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
spark0401.py 2.79 KB
迪卡普里奥 committed on 2019-11-03 13:27 +08:00 · pyspark
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster("local[2]").setAppName("spark0401")
    sc = SparkContext(conf=conf)
    def my_map():
        data = [1, 2, 3, 4, 5]
        # turn the local list into an RDD
        rdd1 = sc.parallelize(data)
        rdd2 = rdd1.map(lambda x: x * 2)
        # collect() pulls the results back to the driver
        print(rdd2.collect())
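    # Hedged aside (not in the original file): mapPartitions applies the
    # function once per partition instead of once per element, which can cut
    # per-record overhead; the doubled output is the same as my_map's.
    def my_map_partitions():
        rdd = sc.parallelize([1, 2, 3, 4, 5])
        print(rdd.mapPartitions(lambda part: (x * 2 for x in part)).collect())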
    def my_map2():
        a = sc.parallelize(["dog", "tiger", "lion", "cat", "panther", "eagle"])
        b = a.map(lambda x: (x, 1))
        print(b.collect())
    def my_filter():
        data = [1, 2, 3, 4, 5]
        rdd1 = sc.parallelize(data)
        mapRdd = rdd1.map(lambda x: x * 2)
        filterRdd = mapRdd.filter(lambda x: x > 5)
        print(filterRdd.collect())
        # the same pipeline, chained in one expression
        print(sc.parallelize(data).map(lambda x: x * 2).filter(lambda x: x > 5).collect())
    def my_flatMap():
        data = ["hello spark", "hello world", "hello world"]
        rdd = sc.parallelize(data)
        print(rdd.flatMap(lambda line: line.split(" ")).collect())
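        # Hedged contrast (not in the original file): plain map keeps the
        # nesting, yielding one list per line instead of a flat stream of words.
        print(rdd.map(lambda line: line.split(" ")).collect())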
    def my_groupBy():
        data = ["hello spark", "hello world", "hello world"]
        rdd = sc.parallelize(data)
        mapRdd = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
        groupByRdd = mapRdd.groupByKey()
        print(groupByRdd.collect())
        print(groupByRdd.map(lambda x: {x[0]: list(x[1])}).collect())
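        # Hedged aside (not in the original file): mapValues folds each group
        # back into a count, matching what reduceByKey computes below.
        print(groupByRdd.mapValues(lambda vals: sum(vals)).collect())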
    def my_reduceByKey():
        data = ["hello spark", "hello world", "hello world"]
        rdd = sc.parallelize(data)
        mapRdd = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
        reduceByKeyRdd = mapRdd.reduceByKey(lambda a, b: a + b)
        print(reduceByKeyRdd.collect())
    def my_sort():
        data = ["hello spark", "hello world", "hello world"]
        rdd = sc.parallelize(data)
        mapRdd = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
        reduceByKeyRdd = mapRdd.reduceByKey(lambda a, b: a + b)
        # sort by key (the word), descending
        print(reduceByKeyRdd.sortByKey(False).collect())
        # sort by count: swap (word, count) to (count, word), sort, swap back
        print(reduceByKeyRdd.map(lambda x: (x[1], x[0])).sortByKey(False).map(lambda x: (x[1], x[0])).collect())
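        # Hedged alternative (not in the original file): sortBy takes an
        # arbitrary key function, avoiding the swap-and-swap-back dance.
        print(reduceByKeyRdd.sortBy(lambda x: x[1], ascending=False).collect())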
    def my_union():
        a = sc.parallelize([1, 2, 3])
        b = sc.parallelize([3, 4, 5])
        print(a.union(b).collect())
    def my_distinct():
        a = sc.parallelize([1, 2, 3])
        b = sc.parallelize([3, 4, 2])
        print(a.union(b).distinct().collect())
    def my_join():
        a = sc.parallelize([("A", "a1"), ("C", "c1"), ("D", "d1"), ("F", "f1"), ("F", "f2")])
        b = sc.parallelize([("A", "a2"), ("C", "c2"), ("C", "c3"), ("E", "e1")])
        # print(a.join(b).collect())
        # print(a.leftOuterJoin(b).collect())
        # print(a.rightOuterJoin(b).collect())
        print(a.fullOuterJoin(b).collect())
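        # For reference (hedged; row order may vary): fullOuterJoin keeps keys
        # from both sides and pads the missing side with None, e.g.
        # ('D', ('d1', None)) and ('E', (None, 'e1')).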
    def my_action():
        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        rdd = sc.parallelize(data)
        print(rdd.collect())
        print(rdd.reduce(lambda x, y: x + y))
        # foreach runs on the executors, so its output goes to the worker logs
        rdd.foreach(lambda x: print(x))
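    # Hedged aside (not in the original file): a few more common actions,
    # all standard RDD API calls; like most demos here, it is defined but
    # not called below.
    def my_more_actions():
        rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        print(rdd.count())  # number of elements
        print(rdd.take(3))  # first three elements
        print(rdd.max())    # largest element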
    my_union()  # swap in any of the other demos above to run them
    sc.stop()
Clone (HTTPS): https://gitee.com/cucy/pyspark_project.git
Clone (SSH): git@gitee.com:cucy/pyspark_project.git