作者 | 周萝卜
来源 | 萝卜大杂烩
数据获取
https://www.visualcapitalist.com/the-50-most-visited-websites-in-the-world/
# Third-party packages used to download the page, parse its HTML and hold the
# resulting table.
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Download the "50 most visited websites" article and collect the cell text of
# every row of its first table into ``data_list`` (one list of strings per row).

# Send a regular desktop-browser User-Agent string with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.69 Safari/537.36"
    )
}
res = requests.get(
    "https://www.visualcapitalist.com/the-50-most-visited-websites-in-the-world/",
    headers=headers,
    timeout=30,  # do not hang forever if the server never answers
)
# Fail loudly on an HTTP error instead of trying to parse an error page.
res.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning about the guess.
soup = BeautifulSoup(res.text, "html.parser")
tbody = soup.find("table").find("tbody")
tr_list = tbody.find_all("tr")

# One inner list per <tr>, holding the text of each of its <td> cells.
data_list = [[td.text for td in tr.find_all("td")] for tr in tr_list]
print(data_list)
[['1', 'Google.com', '92.5B', 'U.S.', 'Search Engines'],
['2', 'Youtube.com', '34.6B', 'U.S.', 'TV Movies and Streaming'],
['3',
'Facebook.com',
'25.5B',
'U.S.',
'Social Networks and Online Communities'],
['4',
'Twitter.com',
'6.6B',
'U.S.',
'Social Networks and Online Communities'],
['5', 'Wikipedia.org', '6.1B', 'U.S.', 'Dictionaries and Encyclopedias'],
['6',
'Instagram.com',
'6.1B',
'U.S.',
'Social Networks and Online Communities'],
....
def _traffic_to_billions(traffic):
    """Convert a traffic label such as ``'92.5B'`` or ``'850M'`` to billions.

    Labels containing ``B`` are taken as billions; anything else is treated
    as an ``M`` (millions) label and divided by 1000. The result is always a
    ``float`` so the resulting column has a single numeric type.
    """
    if "B" in traffic:
        return float(traffic.split("B")[0])
    return float(traffic.split("M")[0]) / 1000


# Turn the scraped rows into a DataFrame with named columns.
df = pd.DataFrame(data_list).rename(
    columns={0: 'Rank', 1: 'WebSite', 2: 'Traffic', 3: 'Country', 4: 'Type'}
)
# Numeric traffic in billions, derived from the textual Traffic column.
df['new_Traffic'] = df['Traffic'].apply(_traffic_to_billions)
print(df)
Rank WebSite Traffic Country Type new_Traffic
0 1 Google.com 92.5B U.S. Search Engines 92.5
1 2 Youtube.com 34.6B U.S. TV Movies and Streaming 34.6
2 3 Facebook.com 25.5B U.S. Social Networks and Online Communities 25.5
3 4 Twitter.com 6.6B U.S. Social Networks and Online Communities 6.6
4 5 Wikipedia.org 6.1B U.S. Dictionaries and Encyclopedias 6.1
5 6 Instagram.com 6.1B U.S. Social Networks and Online Communities 6.1
6 7 Baidu.com 5.6B China Search Engines 5.6
7 8 Yahoo.com 3.8B U.S. News and Media 3.8
8 9 xvideos.com 3.4B Czech Republic Adult 3.4
9 10 pornhub.com 3.3B Canada Adult 3.3
10 11 Yandex.ru 3.2B Russia Search Engines 3.2
11 12 Whatsapp.com 3.1B U.S. Social Networks and Online Communities 3.1
12 13 Amazon.com 2.9B U.S. Marketplace 2.9
...
# Expand df into a long-format table (every row repeated 24 times), attach a
# 'date' column, and write the result to newdf.csv.
# NOTE(review): this listing has lost its indentation, so the nesting of the
# two loop bodies below cannot be read off the text; restore it before running.
# NOTE(review): `np` and `d_list` are not defined in the visible part of the
# file; presumably `import numpy as np` and a list of dates. TODO confirm.
web_name = df['WebSite'].values.tolist()
# Repeat each row of df 24 times along axis 0; column names are lost here and
# restored on the next line.
newdf = pd.DataFrame(np.repeat(df.values,24,axis=0))
newdf.columns = df.columns
newdf['date'] = ''
# NOTE(review): pandas documents that rows yielded by iterrows() may be
# copies, so the assignment to r['date'] below may never reach newdf. Verify.
for i, r in newdf.iterrows():
print(r['WebSite'])
tag = 0
ni = 0
# Walk the site names in reverse order, looking for the current row's site.
for j in web_name[::-1]:
if r['WebSite'] == j:
print(tag)
print(ni)
# Stores the tail of d_list starting at index `tag` (a slice, not one value).
r['date'] = d_list[tag:]
ni += 1
tag += 1
# Keep only the columns that are exported, then rename them for the CSV.
newdf=newdf[['WebSite','Type','new_Traffic', 'date']]
newnew = newdf.rename(columns={'WebSite':'name','Type': 'type', 'new_Traffic':'value'})
# index=0 is falsy, so the row index is not written to the file.
newnew.to_csv('newdf.csv', index=0)
可视化分析
# pyecharts chart classes, option builders, themes and the JsCode wrapper used
# by the charts in this section.
from pyecharts.charts import Bar, Map, Line, Page, Scatter, Pie, Polar
from pyecharts import options as opts
from pyecharts.globals import SymbolType, ThemeType
from pyecharts.charts import Grid, Liquid
from pyecharts.commons.utils import JsCode
# Horizontal bar chart of the first ten rows of df: site name against traffic.
x_data = df['WebSite'].values.tolist()[:10]
y_data = df['new_Traffic'].values.tolist()[:10]

# Build the chart step by step; every pyecharts call below configures `b`.
b = Bar()
b.add_xaxis(x_data)
b.add_yaxis('', y_data)
b.set_global_opts(
    title_opts=opts.TitleOpts(),
    yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
)
b.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='right'))
# Swap the axes so the bars run horizontally.
b.reversal_axis()

# Wrap the chart in a themed grid with a 20% left offset.
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
# Horizontal bar chart of rows 10-19 of df (ranks 11 to 20): site name
# against traffic.
x_data = df['WebSite'].values.tolist()[10:20]
y_data = df['new_Traffic'].values.tolist()[10:20]
b = (
    Bar()
    .add_xaxis(x_data)
    .add_yaxis('', y_data)
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        # The options class is LabelOpts; the misspelt `LabelOps` does not
        # exist in pyecharts.options and raised AttributeError.
        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='right'))
    # Swap the axes so the bars run horizontally.
    .reversal_axis()
)
# Wrap the chart in a themed grid with a 20% left offset.
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
# Count sites per country, largest count first, and chart the top seven.
country_group = (
    df.groupby("Country")
    .count()
    .sort_values(by=["Rank"], ascending=False)
)
x_data = country_group.index.tolist()[:7]
y_data = country_group["Rank"].values.tolist()[:7]

# Vertical bar chart: country on the x axis, number of sites on the y axis.
b = Bar()
b.add_xaxis(x_data)
b.add_yaxis('', y_data)
b.set_global_opts(
    title_opts=opts.TitleOpts(),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
)
b.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='top'))

# Wrap the chart in a themed grid with a 20% left offset.
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
# Scatter chart of the current x_data / y_data, with a size-type visual map
# whose range runs from 1 to 30.
c = Scatter()
c.add_xaxis(x_data)
c.add_yaxis("", y_data)
c.set_global_opts(
    title_opts=opts.TitleOpts(),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    visualmap_opts=opts.VisualMapOpts(type_="size", max_=30, min_=1),
)

# Wrap the chart in a themed grid with a 20% left offset.
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(c, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
# Count sites per category (Type), most common category first.
type_group = df.groupby("Type").count().sort_values(by=["Rank"], ascending=False)
x_type = type_group.index.tolist()
y_type = type_group["Rank"].values.tolist()
# Category labels, written with their spaces so they match the values in the
# Type column (the run-together spellings could never equal a real category).
test = [
    'Social Networks and Online Communities',
    'Marketplace',
    'News and Media',
    'Search Engines',
    'Adult',
    'Programming and Developer Software',
    'Email',
]
# Polar bar chart of the first nine categories and their site counts.
c = Polar()
c.add_schema(
    angleaxis_opts=opts.AngleAxisOpts(data=x_type[:9], type_="category")
)
c.add("", y_type[:9], type_="bar", stack="stack0")
c.set_global_opts(title_opts=opts.TitleOpts(title=""))

# Wrap the chart in a themed grid with a 20% left offset.
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(c, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
# Four liquid gauges, each showing one category's share of all counted sites
# (y_type[k] divided by the sum of y_type), combined into a single grid.

# JavaScript label formatter shared by l2 and l4: value as a percentage,
# truncated to two decimals.
_PERCENT_LABEL_JS = """function(param){
return(Math.floor(param.value*10000)/100)+'%';
}"""
_type_total = sum(y_type)

# Third category, default label.
l1 = Liquid()
l1.add("", [y_type[2] / _type_total], center=["35%", "75%"])
l1.set_global_opts(title_opts=opts.TitleOpts(title=""))

# First category, with the custom percentage label.
l2 = Liquid()
l2.add(
    "lq",
    [y_type[0] / _type_total],
    center=["25%", "26%"],
    label_opts=opts.LabelOpts(
        font_size=50,
        formatter=JsCode(_PERCENT_LABEL_JS),
        position="inside",
    ),
)

# Second category, default label.
l3 = Liquid()
l3.add("", [y_type[1] / _type_total], center=["75%", "26%"])
l3.set_global_opts(title_opts=opts.TitleOpts(title=""))

# Fourth category, with the custom percentage label.
l4 = Liquid()
l4.add(
    "",
    [y_type[3] / _type_total],
    center=["65%", "75%"],
    label_opts=opts.LabelOpts(
        font_size=50,
        formatter=JsCode(_PERCENT_LABEL_JS),
        position="inside",
        is_show=True,
    ),
)

# Add the gauges to one themed grid in the order l1, l2, l3, l4.
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
for _liquid in (l1, l2, l3, l4):
    grid.add(_liquid, grid_opts=opts.GridOpts())
grid.render_notebook()
动态排行展示
分享
点收藏
点点赞
点在看
文章转发自AI科技大本营微信公众号,版权归其所有。文章内容不代表本站立场和任何投资暗示。
Copyright © 2021.Company 元宇宙YITB.COM All rights reserved.元宇宙YITB.COM