instagram 内容抓取

2018-09-27 19:00:00

抓取说明

1、需要登录信息,即抓取时需要附带cookie,同时需要user-agent

2、数据获取接口及下载均有频率限制,无间隔的请求(几百个资源)会被限制,在被限制后睡眠一定时间继续。

3、内容抓取分为两个入口

  • 一个是抓取某个用户发布的所有资源
  • 一个是抓取某个tag下的所有资源

两种入口附带的cookie不同,请求的URL不同。

4、抓取步骤:

  1. 电脑端登录ins,保存 cookie、query_hash、user-agent 信息。后续所有请求附带 cookie、user-agent
  2. 模拟请求个人主页/tag主页,通过解析HTML页面,得到userId/tag name。同时拿到第一页的数据及下页cursor。
  3. 通过API接口,根据cursor持续获取多页数据。所有数据获取完毕后开始下载。
  4. 返回的数据中,图片资源可以直接下载。视频资源需要再次请求视频地址获取接口获得视频地址,然后再下载。

5、请求数据接口:

user:

https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D

tag:

https://www.instagram.com/graphql/query/?query_hash=1780c1b186e2c37de9f7da95ce41bb67&variables=%7B%22tag_name%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D

获取视频的地址:

https://www.instagram.com/p/%s/?__a=1

核心代码

/**
 * Fetch the landing page of a user or a tag and collect the first page of posts.
 *
 * The page HTML is parsed for the numeric user id (or the tag name) and for the
 * embedded `window._sharedData` JSON, which carries the first page of posts and
 * the cursor of the next page. Every post on the first page is passed to
 * storeMedia, and the promise settles only after all of those calls have finished.
 *
 * @param {{name: string, type: string}} item - crawl target; type 'user' selects
 *     the profile page, any other value selects the tag page
 * @returns {Promise<{totalPage: number, userId: string, cursor: string}>} paging
 *     info; `cursor` is the `end_cursor` reported by the first page
 * @throws rejects when the request fails or when the page cannot be parsed
 */
const getHtml = async item => {
    const userName = item.name
    const type = item.type
    const isUser = type === 'user'

    // The two entry points need different cookies and different URLs.
    headers.cookie = isUser ? userCookie : tagCookie
    const options = {
        method: 'GET',
        url: isUser ? `${baseUrl}${userName}/` : `${baseUrl}explore/tags/${userName}/`,
        headers: headers
    }

    const pageBody = await new Promise((resolve, reject) => {
        request(options, function (error, response, body) {
            if (error) return reject(error)
            return resolve(body)
        })
    })

    let userId, edges, totalPage, cursor
    try {
        const html = cheerio.load(pageBody).html()

        // Extract the uid / tag name.
        if (isUser) {
            userId = html.match(/"profilePage_([0-9]+)"/)[1]
        } else {
            // The pattern only accepts ASCII letters and underscores; fall back to the
            // requested tag so other tag names do not abort the crawl.
            // NOTE(review): assumes the requested name is what the API expects — confirm.
            const tagMatch = html.match(/"name":"([a-zA-Z_]+)",/)
            userId = tagMatch ? tagMatch[1] : userName
        }
        log.info(`${userName} id/name 获取成功 ${userId}`)

        // Extract the first page of data embedded in the HTML.
        const sharedData = JSON.parse(
            html.match(/<script type="text\/javascript">window._sharedData = (.*?);<\/script>/)[1]
        )
        const firstPage = isUser
            ? sharedData.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media
            : sharedData.entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media

        edges = firstPage.edges
        cursor = firstPage.page_info.end_cursor
        totalPage = Math.ceil(firstPage.count / purePage)
        if (!Array.isArray(edges)) throw new Error('edges is not an array')
    } catch (error) {
        // A failed match or an unexpected JSON shape must reject the promise
        // instead of throwing inside a callback where nobody can catch it.
        throw new Error(`failed to parse page of ${userName}: ${error.message}`)
    }

    // Store the first page and wait for it: storeMedia may need an extra request
    // per video. A single bad post is logged and skipped.
    await Promise.all(edges.map(edge => {
        edge.mode = type
        return Promise.resolve()
            .then(() => storeMedia(edge))
            .catch(storeError => log.error('store media error', storeError))
    }))

    // Paging info for the follow-up requests.
    return {
        totalPage: totalPage,
        userId: userId,
        cursor: cursor
    }
}


/**
 * Collect the remaining pages of a user/tag by chaining fetchData calls.
 *
 * The chain is an async.waterfall seeded with (item, uid, cursor) followed by
 * one fetchData step per page, capped at pageLimit. Whatever ends the chain
 * (last page reached, page cap hit, or an error passed to the callback), the
 * promise resolves with the shared `media` list, so it never rejects.
 *
 * @param {{name: string, type: string}} item - crawl target
 * @param {number} totalPage - number of pages computed from the first page
 * @param {string} uid - user id or tag name
 * @param {string} cursor - cursor of the next page; empty when there is none
 * @returns {Promise<Array>} the shared `media` list
 */
const getAllUrls = (item, totalPage, uid, cursor) => {
    let userName = item.name
    log.info(`${userName} 数据共 ${totalPage} 页`)

    return new Promise((resolve) => {
        const finish = () => {
            log.info(`${userName} 的所有帖子数据获取成功,共${media.length}个帖子,视频${videoCount}个,图片${imgCount}个`, )
            fetchPageCount = 0
            return resolve(media)
        }

        // Without a cursor the first page was the only page; requesting again
        // would send the literal text "null" as the `after` parameter.
        if (!cursor) return finish()

        let actions = [async.constant(item, uid, cursor)]
        let limit = totalPage > pageLimit ? pageLimit : totalPage
        for (let i = 0; i < limit; i++) {
            actions.push(fetchData)
        }

        // fetchData reports "no next page" through the error argument too, so the
        // error is deliberately not treated as a failure here.
        async.waterfall(actions, () => finish())
    })
}

/**
 * Request one page of posts and hand control to the next waterfall step.
 *
 * Outcomes reported through `next`:
 *  - request error or `status: 'fail'`: wait one minute, then pass the same
 *    cursor on so the following step retries it;
 *  - body that is not JSON: pass the same cursor on immediately;
 *  - unexpected response shape: `next(error)`, called exactly once;
 *  - last page: `next('所有数据获取完毕,无下页')`;
 *  - otherwise: wait two seconds, then pass `end_cursor` on.
 *
 * Every post of the page is passed to storeMedia and awaited before moving on.
 *
 * @param {{name: string, type: string}} item - crawl target
 * @param {string} uid - user id or tag name
 * @param {string} offset - cursor of the page to request
 * @param {Function} next - waterfall callback `(error, item, uid, cursor)`
 */
const fetchData = (item, uid, offset, next) => {
    const userName = item.name
    const type = item.type
    const isUser = type === 'user'

    headers.cookie = isUser ? userCookie : tagCookie
    const options = {
        method: 'GET',
        url: util.format(isUser ? fetchUserUrl : fetchTagUrl, uid, offset),
        headers: headers
    }

    // Sleep one minute, then let the next step retry the same cursor.
    const retryLater = () => {
        log.info('休息1min~')
        return setTimeout(function () {
            return next(null, item, uid, offset)
        }, 1 * 60 * 1000)
    }

    request(options, async function (error, response, body) {
        if (error) {
            log.error('fetch data error', error)
            return retryLater()
        }

        let data
        try {
            data = JSON.parse(body)
        } catch (parseError) {
            log.error('json序列化失败', parseError)
            return next(null, item, uid, offset)
        }

        if (data && data.status === 'fail') {
            log.error('返回内容失败', data)
            return retryLater()
        }

        let edges, pageInfo
        try {
            const listData = isUser
                ? data.data.user.edge_owner_to_timeline_media
                : data.data.hashtag.edge_hashtag_to_media
            edges = listData.edges
            pageInfo = listData.page_info
            if (!Array.isArray(edges) || !pageInfo) throw new Error('unexpected response shape')
        } catch (shapeError) {
            log.error('数据获取失败', shapeError)
            // Must return: continuing would throw after `next` has already fired.
            return next(shapeError)
        }

        // storeMedia may issue an extra request per video; wait for all of them so
        // no post is still pending when the chain finishes. A bad post is skipped.
        await Promise.all(edges.map(edge => {
            edge.mode = type
            return Promise.resolve()
                .then(() => storeMedia(edge))
                .catch(storeError => log.error('store media error', storeError))
        }))

        const {
            has_next_page,
            end_cursor
        } = pageInfo

        log.info(`page:${++fetchPageCount} ${userName} 数据获取成功,帖子 ${edges.length} 个, has_next_page: ${has_next_page} ,end_cursor: ${end_cursor}`)

        if (!has_next_page) {
            // A non-null first argument stops the waterfall.
            return next('所有数据获取完毕,无下页')
        }
        // Short pause between pages to stay under the rate limit.
        setTimeout(function () {
            return next(null, item, uid, end_cursor)
        }, 2000)
    })
}

/**
 * Resolve the download address of a video from its shortcode.
 *
 * The returned promise never rejects: a request error resolves with an empty
 * string, and a body that cannot be parsed or lacks the expected path also
 * resolves with an empty string.
 *
 * @param {string} mode - 'user' uses the user cookie, anything else the tag cookie
 * @param {string} shortcode - shortcode of the post
 * @returns {Promise<string>} the video address, or '' when it cannot be obtained
 */
const fetchVideoUrl = (mode, shortcode) => {
    const target = util.format(getVideoUrl, shortcode)
    headers.cookie = mode == 'user' ? userCookie : tagCookie

    const requestOptions = {
        method: 'GET',
        url: target,
        headers: headers
    }

    return new Promise(resolve => {
        request(requestOptions, (error, response, body) => {
            if (error) {
                log.error(`获取 ${shortcode} 视频地址失败`, error)
                return resolve('')
            }

            let address = ''
            try {
                address = JSON.parse(body).graphql.shortcode_media.video_url
            } catch (parseError) {
                log.error(`获取 ${shortcode} videoUrl 为空`)
            }
            return resolve(address)
        })
    })
}

/**
 * Turn one post edge into a media record and append it to the shared `media` list.
 *
 * The record always carries `id` and `desc`; `type` and `url` are only set when
 * an address is available, and the matching counter is incremented in that case.
 * For a video without `video_url` the address is looked up through fetchVideoUrl.
 *
 * @param {object} item - post edge; `item.node` is the post, `item.mode` the
 *     crawl type forwarded to fetchVideoUrl
 * @returns {Promise<void>}
 */
const storeMedia = async item => {
    const node = item.node
    const record = { id: node.id }
    const firstCaption = node.edge_media_to_caption.edges[0]
    record.desc = firstCaption ? firstCaption.node.text : ''

    if (!node.is_video) {
        // Image post: the display address can be used directly.
        const imgUrl = node.display_url
        if (imgUrl) {
            record.type = 'img'
            record.url = imgUrl
            imgCount++
        }
    } else {
        // Video post: use the inline address, otherwise ask the detail endpoint.
        const videoUrl = node.video_url || await fetchVideoUrl(item.mode, node.shortcode)
        if (videoUrl) {
            record.type = 'video'
            record.url = videoUrl
            videoCount++
        }
    }

    media.push(record)
}

/**
 * Download one video/image to disk and record it in the JSON index.
 *
 * `next(null)` is called exactly once, whatever happens; failures are logged
 * rather than reported, so the surrounding series keeps going.
 *  - already downloaded, or a record without a known type: skipped;
 *  - non-2xx response: logged and skipped, nothing is written;
 *  - success: reported once the file has been flushed ('finish' of the write
 *    stream), then the index is updated and the counter incremented;
 *  - stream error: the partial file is removed so a later run retries it;
 *  - request error: treated as rate limiting, continues after one minute.
 *
 * @param {string} category - user/tag name stored with the index entry
 * @param {{id: string, type: string, url: string, desc: string}} media - record to download
 * @param {Function} next - series callback
 */
const download = (category, media, next) => {

    if (isFileExist(media.id)) return next(null)

    let filePath
    if (media.type == 'video') {
        filePath = `${videoDlPath}/${media.id}.mp4`
    } else if (media.type == 'img') {
        filePath = `${imgDlPath}/${media.id}.jpg`
    } else return next(null)

    // Several events can signal the end of the transfer; only the first one counts.
    let settled = false
    const done = () => {
        if (settled) return
        settled = true
        return next(null)
    }

    // isFileExist only checks for presence, so an incomplete file must not stay.
    let fws = null
    const discardPartial = () => {
        if (!fws) return
        fws.destroy()
        fs.unlink(filePath, () => {})
    }

    let st = new Date()
    request(media.url)
        .on('response', function (res) {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                log.error('download error', `unexpected status ${res.statusCode}`, media.url)
                // Drain the body so the connection is released.
                res.resume()
                return done()
            }

            fws = fs.createWriteStream(filePath)
            res.pipe(fws)

            fws.on('finish', function () {
                if (settled) return
                let et = new Date()
                let ut = timeUsed((et - st) / 1000)
                log.info(`${videoDl + imgDl} finish download ${category} ${filePath},用时${ut}`)
                saveJsonData(media.type, {
                    id: media.id,
                    category: category,
                    desc: media.desc
                })
                if (media.type == 'video') videoDl++
                else imgDl++

                return done()
            })

            const onStreamError = err => {
                if (settled) return
                log.error('download error', err)
                discardPartial()
                return done()
            }
            fws.on('error', onStreamError)
            res.on('error', onStreamError)
        })
        .on('error', function (err) {
            if (settled) return
            log.error('request source failed', media.url, err)
            discardPartial()
            // Recovers in roughly three minutes.
            log.info('超频啦!休息1分钟~')
            setTimeout(done, 1 * 60 * 1000)
        })

}

/**
 * Tell whether a resource has already been downloaded.
 *
 * Checks for `<id>.mp4` in the video directory first, then `<id>.jpg` in the
 * image directory, and logs the path that was found.
 *
 * @param {string} id - resource id used as the file name
 * @returns {boolean} true when either file exists
 */
const isFileExist = id => {
    const candidates = [
        ['video file exist', `${videoDlPath}/${id}.mp4`],
        ['img file exist', `${imgDlPath}/${id}.jpg`]
    ]

    for (const [message, location] of candidates) {
        if (fs.existsSync(location)) {
            log.info(message, location)
            return true
        }
    }
    return false
}

/**
 * Append one entry to the JSON index right after a download succeeds, so the
 * information survives if the program crashes midway.
 *
 * The index is `data.json` inside the video or image JSON directory and holds
 * a single array. Failures are logged and never thrown; an index that cannot
 * be parsed, or that is not an array, is left untouched.
 *
 * @param {string} type - 'video' selects the video index, anything else the image index
 * @param {object} data - entry to append
 */
const saveJsonData = (type, data) => {
    try {
        const jsonFile = `${type == 'video' ? videoJsonPath : imgJsonPath}/data.json`

        // Read what is already there.
        let jsonData = []
        if (fs.existsSync(jsonFile)) {
            // Declared locally: an undeclared assignment here throws in strict mode
            // and the catch below would hide it.
            const fileData = fs.readFileSync(jsonFile, {
                encoding: 'utf8'
            })
            if (fileData) {
                jsonData = JSON.parse(fileData)
            }
        }
        if (!Array.isArray(jsonData)) {
            throw new Error(`${jsonFile} does not contain a JSON array`)
        }

        // Write everything back with the new entry.
        jsonData.push(data)
        fs.writeFileSync(jsonFile, JSON.stringify(jsonData))

    } catch (error) {
        log.error('写入json文件失败', data, error)
    }

}

/**
 * Reset the crawl state: a fresh empty `media` list and all four counters
 * (found videos/images, downloaded videos/images) back to zero.
 */
const clearData = () => {
    videoCount = imgCount = videoDl = imgDl = 0
    media = []
}

/**
 * Download every collected resource of one user/tag, one after another.
 *
 * Each record becomes one async.series step that calls download. The promise
 * resolves with the series result and ignores the series error.
 *
 * @param {string} userName - user/tag name forwarded to download as the category
 * @param {Array} data - media records to download
 * @returns {Promise<Array>} the result array produced by async.series
 */
const downloadAll = (userName, data) => {
    const toStep = function (entry) {
        return function (done) {
            download(userName, entry, done)
        }
    }
    const steps = data.map(entry => toStep(entry))

    return new Promise(function (resolve) {
        async.series(steps, function (error, result) {
            return resolve(result)
        })
    })
}


/**
 * Format a duration in seconds for display.
 *
 * Below one minute the value is rounded up to whole seconds; above that the
 * two largest units are shown, each rounded down: `XmYs`, `XhYm` or `Xd Yh`.
 *
 * @param {number} t - duration in seconds
 * @returns {string} formatted duration
 */
const timeUsed = t => {
    const MINUTE = 60
    const HOUR = 60 * 60
    const DAY = 60 * 60 * 24

    // Under a minute: whole seconds, rounded up.
    if (t < MINUTE) {
        return `${Math.ceil(t)}s`
    }
    // Under an hour: minutes and seconds.
    if (t < HOUR) {
        const minutes = Math.floor(t / MINUTE)
        const seconds = Math.floor(t % MINUTE)
        return `${minutes}m${seconds}s`
    }
    // Under a day: hours and minutes.
    if (t < DAY) {
        const hours = Math.floor(t / HOUR)
        const minutes = Math.floor(t % HOUR / MINUTE)
        return `${hours}h${minutes}m`
    }
    // A day or more: days and hours.
    const days = Math.floor(t / DAY)
    const hours = Math.floor(t % DAY / HOUR)
    return `${days}d ${hours}h`
}

/**
 * Crawl one user/tag: read the first page, collect the remaining pages, then
 * download everything.
 *
 * `next(null)` is called exactly once, after the task has finished or failed.
 * Failures are logged and never propagated, so one broken target does not stop
 * the others. The shared crawl state is reset before `next` is called in both
 * cases, which keeps leftovers of a failed target away from the following one.
 *
 * @param {{name: string, type: string}} item - crawl target
 * @param {Function} next - series callback
 * @returns {Promise<*>} settles with whatever `next` returns
 */
const task = async (item, next) => {
    const userName = item.name

    let pageInfo
    try {
        pageInfo = await getHtml(item)
    } catch (error) {
        log.error('fetch error', error)
        clearData()
        // Stop here: there is no paging info to continue with.
        return next(null)
    }

    try {
        const { totalPage, userId, cursor } = pageInfo
        const data = await getAllUrls(item, totalPage, userId, cursor)

        // `data` keeps the collected list; the shared state starts over for the downloads.
        clearData()

        const st = new Date()
        await downloadAll(userName, data)
        const et = new Date()
        const ut = timeUsed((et - st) / 1000)
        log.info(`${userName} 所有下载完成, video ${videoDl} 个,img ${imgDl} 个,共用时 ${ut}`)
    } catch (error) {
        log.error(`${userName} task error`, error)
    }

    clearData()
    return next(null)

}

/**
 * Run the crawl task of every configured target one after another, log the
 * number of finished tasks together with the series error, then exit the
 * process with code 0.
 */
const main = () => {
    const jobs = target.map(function (entry) {
        return function (done) {
            task(entry, done)
        }
    })

    const onAllDone = function (error, result) {
        log.info(`所有 ${result.length} 个任务完成`, error)
        process.exit(0)
    }

    async.series(jobs, onAllDone)
}

// Script entry point: the crawl starts as soon as this file is loaded.
main()

完整代码: https://github.com/flute/instagram-crawler

coub.com 内容抓取

抓取说明 1、总共17个分类。 2、数据获取url: https://coub.com/api/v2/timeline/hot/movies/half?per_page=25 说明:movies 为分类。per_page 为每页返回的数据量[1,25]。首次获取只需传入 page=1 即为第一页的数据。下次请求附带字段 anchor 为上次请求返回的 next 参数即可。 3、每个资源的属性: 唯一标志:id、permalink;资源描述:title。 4、下载:coub.com 的音频和视频是分开的,下载的时候需要将音视频分别下载,然后使用 FFmpeg 合并。下载及合并使用开源项目 https://github.com/TeeSeal/coub-dl 5、分类数组 ["animals-pets"

Mac Photoshop 2018(Adobe全家桶)下载破解

请支持正版软件!!!适用于Adobe所有软件的安装破解,以 Photoshop 2018 为例。根据电脑语言为中文/英文,安装完的软件自动为中文/英文。1、下载安装 Adobe creative cloud下载地址:https://www.adobe.com/creativecloud/desktop-app.html可以理解为Adobe的下载器,可以使用这个下载器下载所有Adobe产品。下载完成后,选择登陆/注册Adobe账户,然后你就可以看到如下页面:而事实上,从官网介绍来看,我们需要的是这样的(Apps页面可下载Adobe的产品):猜测可能是因为注册的账户是天朝或者所在地为天朝的原因,好多功能被屏蔽了,大天朝还真是一枝独秀,且看我手势 🖕解决方法:macOS修改 Adobe creative cloud 的配置文件:false 改为 true:sudo vi /Library/Application Support/Adobe/