版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/weixin_40313634/article/details/96611824
爬取过程中的问题解决
环境 :Windows + Python 3.7 + selenium
1. class 中含有空格,如何定位
<div class="u-cover u-cover-1">
browser.find_elements_by_css_selector('[class = "u-cover u-cover-1"]')
2. 定位信息好着,就是获取不到元素
原因:
selenium 打开网页后, 默认是在父级 Frame 里, 直接搜索是搜不到子 Frame 里的信息的。 需要切换 Frame。
方法:
switch_to.frame('frameid')
switch_to.parent_frame()
- 类似问题
原因:点击后,浏览器新打开了个选项卡。没有切换过来,导致还是在旧的选项卡里查找
解决:
browser.switch_to_window(browser.window_handles[1])
- 例子如下
<iframe name="contentFrame" id="g_iframe" class="g-iframe" scrolling="auto" frameborder="0" src="about:blank" allowfullscreen="true"></iframe>
browser.switch_to.frame('g_iframe')
3. find_elements后点击不了抓取的元素
- 问题: for 循环执行第二次时报错如下:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
- 代码
we_gedans = self.browser.find_elements_by_css_selector('[class = "u-cover u-cover-1"]')
for we_gedan in we_gedans: we_gedan.click()
- 原因
官方给出解释如下:
The element has been deleted entirely.
The element is no longer attached to the DOM.
就是页面元素过期,引用的元素过时,不再依附于当前页面,需要重新定位获取元素对象
find_elements 查找到的是 WebElement 类型的数组数据,含有元素在当前页面的地址信息,调用 click() 方法就是使用了此地址信息。
切换页面后,此地址信息就失效了。所以不能直接在循环中调用 click()
4. eyed3 pip 安装成功, import报错
- 报错如下
import magic
File "C:\Users\zuoy\AppData\Local\Programs\Python\Python37\lib\site-packages\magic.py", line 181, in <module>
raise ImportError('failed to find libmagic. Check your installation')
ImportError: failed to find libmagic. Check your installation
- 原因: eyed3 依赖 magic,必须安装上这个才能使用
pip install python-magic-bin
pip install eyed3
代码
#!/usr/bin/env python
"""Visit the NetEase Cloud Music site and download the free songs of its playlists.

Date: 2019/07/20
"""
import json
import os
import time

import eyed3  # writes attributes (album, artist, ...) into the downloaded MP3 files
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class Splider(object):
    """Crawler that lists playlists on music.163.com and downloads their songs.

    A Chrome WebDriver session is opened on construction and shut down at the
    end of :meth:`run`.
    """

    # Characters that are not allowed in Windows file names, each mapped to '_'.
    _UNSAFE_FILENAME_CHARS = str.maketrans({c: '_' for c in '\\/:*?"<>|'})

    def __init__(self):
        self.__url = 'https://music.163.com/'
        self.browser = webdriver.Chrome()

    @staticmethod
    def __json_path(filename):
        """Return *filename* with its extension forced to ``.json``."""
        root, ext = os.path.splitext(filename)
        return filename if ext == '.json' else root + '.json'

    def __get_gedans(self, url=None):
        """Collect the playlists shown on a page.

        Args:
            url: Page to open; defaults to the site home page.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``image`` and
            ``click`` (the text of the ``.nb`` element of each cover).
        """
        self.browser.get(url or self.__url)
        # The page content lives in a child frame; searches from the parent
        # frame do not see it.
        self.browser.switch_to.frame('g_iframe')
        covers = self.browser.find_elements(
            By.CSS_SELECTOR, '[class = "u-cover u-cover-1"]')

        gedans = []
        for cover in covers:
            link = cover.find_element(By.CSS_SELECTOR, 'a')
            gedans.append({
                'title': link.get_attribute('title'),
                'url': link.get_attribute('href'),
                'image': cover.find_element(By.CSS_SELECTOR, 'img').get_attribute('src'),
                'click': cover.find_element(By.CSS_SELECTOR, '.nb').text,
            })
        return gedans

    def __get_songs(self, gedan):
        """Collect the songs listed on one playlist page.

        Args:
            gedan: A playlist dict as produced by ``__get_gedans``; only its
                ``url`` entry is read.

        Returns:
            A list of dicts with the keys ``id``, ``name``, ``songer`` and
            ``cd``, each value with whitespace runs collapsed to one space.
        """
        self.browser.get(gedan['url'])
        self.browser.switch_to.frame('g_iframe')
        # NOTE(review): this matches only rows whose class attribute is exactly
        # "even "; rows carrying any other class value are skipped. Confirm
        # whether the remaining rows of the table should be included as well.
        rows = self.browser.find_elements(By.CSS_SELECTOR, '[class = "even "]')

        songs = []
        for row in rows:
            texts = row.find_elements(By.CSS_SELECTOR, '.text')
            song = {
                'id': row.find_element(
                    By.CSS_SELECTOR, '.left .hd span').get_attribute('data-res-id'),
                'name': row.find_element(By.CSS_SELECTOR, '.f-cb b').get_attribute('title'),
                'songer': texts[0].get_attribute('title'),
                'cd': texts[1].find_element(By.CSS_SELECTOR, 'a').get_attribute('title'),
            }
            # Strip non-printing characters; a missing attribute (None) becomes ''.
            songs.append(
                {key: ' '.join((value or '').split()) for key, value in song.items()})
        return songs

    def __download_song(self, song, directory=''):
        """Download one song as ``<name>.mp3`` and write its tag fields.

        Args:
            song: A song dict as produced by ``__get_songs``.
            directory: Folder to save into; defaults to the current directory.

        Raises:
            ValueError: If the downloaded payload is not recognised as audio.
        """
        base_url = "http://music.163.com/song/media/outer/url?id={0}"
        response = requests.get(base_url.format(song['id']), timeout=30)
        if response.status_code != 200:
            return

        safe_name = song['name'].translate(self._UNSAFE_FILENAME_CHARS)
        filename = os.path.join(directory, safe_name + '.mp3')
        with open(filename, 'wb') as f:
            f.write(response.content)

        at_song = eyed3.load(filename)
        if at_song is None:
            # eyed3 could not identify the file as audio, so there is nothing to tag.
            raise ValueError('not a valid MP3 file: ' + filename)
        if at_song.tag is None:
            # The file has no tag yet; create an empty one before filling it in.
            at_song.initTag()
        at_song.tag.artist = song['songer']
        at_song.tag.album = song['cd']
        at_song.tag.title = song['name']
        at_song.tag.save()

    def __save_json(self, strs, filename):
        """Write *strs* as indented UTF-8 JSON to *filename* (``.json`` enforced)."""
        with open(self.__json_path(filename), 'w', encoding='utf-8') as f:
            f.write(json.dumps(strs, indent=4, ensure_ascii=False))

    def __read_json(self, filename):
        """Load and return the JSON content of *filename* (``.json`` enforced)."""
        with open(self.__json_path(filename), 'r', encoding='utf-8') as f:
            return json.loads(f.read())

    def run(self):
        """Crawl the playlists, then download every listed song into ``songs/``.

        ``gedans.json`` receives the playlist list; ``songs.json`` is rewritten
        for each playlist. Failures on a single playlist or song are printed
        and skipped. The browser session is always shut down at the end.
        """
        try:
            gedans = self.__get_gedans()
            self.__save_json(gedans, 'gedans.json')
            os.makedirs('songs', exist_ok=True)

            for gedan in gedans:
                try:
                    songs = self.__get_songs(gedan)
                except Exception as e:
                    # Without a song list there is nothing to download for
                    # this playlist; move on instead of reusing stale data.
                    print(e.args)
                    continue

                try:
                    self.__save_json(songs, 'songs.json')
                except Exception as e:
                    print(e.args)

                for song in songs:
                    try:
                        self.__download_song(song, 'songs')
                    except Exception as e:
                        print(e.args)
        finally:
            # quit() ends the whole driver session, not only the current window.
            self.browser.quit()
# Start the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    splider = Splider()
    splider.run()