coub.com 内容抓取

2018-09-26 19:00:00

抓取说明

1、总共17个分类。

2、数据获取

  • url:https://coub.com/api/v2/timeline/hot/movies/half?per_page=25
  • 说明:movies 为分类。 per_page 为每页返回的数据量[1,25]。首次获取只需传入 page=1 即为第一页的数据。下次请求附带字段 anchor 为上次请求返回的 next 参数即可。

3、每个资源的属性:

  • 唯一标志: id、permalink
  • 资源描述: title

4、下载

coub.com的音频和视频是分开的,下载的时候需要将音视频分别下载,然后使用FFmpeg合并。
下载及合并使用开源项目 https://github.com/TeeSeal/coub-dl

5、分类数组

["animals-pets", "mashup", "anime", "movies", "gaming", "cartoons", "art", "music", "news", "sports", "science-technology", "celebrity", "nature-travel", "fashion", "dance", "cars", "nsfw"]

核心代码:

/**
 * Fetch one page of the "hot" timeline of a category and append the returned
 * coubs to the shared `videoList`.
 *
 * The signature is shaped as an `async.waterfall` step: the values handed to
 * `next` are exactly the arguments of the following page request. The page
 * size is taken from the module-level `per_page`, the ranking period from
 * `time`.
 *
 * @param {string} c category slug, e.g. "movies"
 * @param {number} page page number to request (defaults to 1)
 * @param {*} anchor `next` value returned by the previous request; left out
 *     of the query string when falsy
 * @param {Function} next callback. `next(err, null)` when the category is
 *     missing, the request fails or the body is not a JSON object; otherwise
 *     `next(null, c, nextPage, nextAnchor)`.
 */
function getCoubVideoList(c, page = 1, anchor, next) {
    if (!c) {
        log.error('category empty', c);
        return next(new Error('category empty'), null);
    }
    const options = {
        method: 'GET',
        url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
        qs: {
            page: page,
            per_page: per_page
        }
    };
    if (anchor) options.qs.anchor = anchor;

    request(options, (error, response, body) => {
        if (error) {
            return next(error, null);
        }

        // A non-JSON body (HTML error page, empty response, ...) must be
        // reported through `next`; throwing inside this callback would leave
        // the waterfall waiting forever.
        let data;
        try {
            data = JSON.parse(body);
        } catch (parseError) {
            log.error(`parse video list failed page ${page}`, parseError);
            return next(parseError, null);
        }
        if (!data || typeof data !== 'object') {
            log.error(`unexpected video list response page ${page}`, data);
            return next(new Error('unexpected video list response'), null);
        }

        // Continue from the page number echoed by the API when there is one,
        // otherwise from the page that was requested.
        const reportedPage = data.page == null ? NaN : Number(data.page);
        const nextPage = (Number.isFinite(reportedPage) ? reportedPage : page) + 1;

        if (Array.isArray(data.coubs) && data.coubs.length) {
            log.info(`获取视频列表成功 page ${page}`, data.next, data.coubs.length);
            videoList = videoList.concat(data.coubs);
        } else {
            // An empty page is not an error: keep walking the remaining pages.
            log.info(`获取内容为空 page ${page}`);
        }
        return next(null, c, nextPage, data.next);
    });
}

/**
 * Ask the API how many pages the "hot" timeline of a category has.
 *
 * Requests page 1 (page size from the module-level `per_page`, ranking period
 * from `time`) and reads `total_pages` from the JSON response.
 *
 * @param {string} c category slug, e.g. "movies"
 * @returns {Promise<number>} resolves with `total_pages`; rejects when the
 *     request fails, the body is not valid JSON, or `total_pages` is missing
 *     or zero
 */
const getTotalPage = (c) => {
    const options = {
        method: 'GET',
        url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
        qs: {
            page: 1,
            per_page: per_page
        }
    };
    return new Promise((resolve, reject) => {
        request(options, (error, response, body) => {
            // Pass an existing Error through untouched so its stack and type
            // survive; only wrap non-Error values.
            if (error) return reject(error instanceof Error ? error : new Error(error));

            // Parsing happens inside an asynchronous callback: an exception
            // thrown here would never reach the promise, so reject explicitly.
            let data;
            try {
                data = JSON.parse(body);
            } catch (parseError) {
                log.info(`获取${c}总页数失败`);
                return reject(parseError);
            }

            if (data && data.total_pages) {
                log.info(`获取${c}总页数成功`, data.total_pages);
                return resolve(data.total_pages);
            }
            log.info(`获取${c}总页数失败`);
            return reject(new Error('页数为空'));
        });
    });
};

/**
 * Collect the coubs of every page of a category.
 *
 * Looks up the page count first, then chains one `getCoubVideoList` call per
 * page through `async.waterfall`, seeded with the category plus the
 * module-level `startPage` / `startAnchor`.
 *
 * @param {string} c category slug
 * @returns {Promise<Array>} resolves with the shared `videoList` once the
 *     chain has finished; rejects with an Error wrapping the waterfall error
 */
const getMultiVideo = async c => {
    const pageCount = await getTotalPage(c);

    // The first step only injects the initial arguments; every later step
    // receives whatever the previous page request passed on.
    const steps = [async.constant(c, startPage, startAnchor)];
    let pageNo = 1;
    while (pageNo <= pageCount) {
        steps.push(getCoubVideoList);
        pageNo += 1;
    }

    return new Promise((resolve, reject) => {
        const onFinished = err => {
            log.info(`finish crawler ${c} videos`, err, videoList.length);
            if (err) {
                reject(new Error(err));
                return;
            }
            resolve(videoList);
        };
        async.waterfall(steps, onFinished);
    });
};

/**
 * Download a single coub into `dlPath` as `<permalink>.mp4`, with its audio
 * track attached, and record its metadata.
 *
 * Designed to be used as an `async.series` task, so `next` is invoked exactly
 * once on every path:
 *   - `next(null, '')`       nothing to do (no permalink) or the coub could not be fetched
 *   - `next(null, filename)` the file already exists on disk
 *   - `next(null, result)`   downloaded; `result` is what `coub.write` resolved with
 *   - `next(error, '')`      writing the file or recording its metadata failed
 *
 * @param {string} c category slug, stored in the saved metadata
 * @param {Object} video coub item; `permalink` names the file, `title` becomes the description
 * @param {Function} next node-style completion callback
 */
async function downloadFile(c, video, next) {
    if (!video || !video.permalink) return next(null, '');
    const id = video.permalink;
    const filename = `${dlPath}/${id}.mp4`;

    // Already downloaded by this or an earlier run.
    if (isFileExist(id)) {
        return next(null, filename);
    }

    // A coub that cannot be fetched is skipped instead of aborting the queue.
    // The failure is only logged here; the single `next` call happens below,
    // so the callback can never fire twice.
    let coub;
    try {
        coub = await Coub.fetch(`http://coub.com/view/${id}`);
    } catch (error) {
        log.error('fetch error', error);
    }
    if (!coub) return next(null, '');

    coub.attachAudio();
    if (fastMode) coub.addOption('-c', 'copy');
    coub.addOption('-shortest');

    const startedAt = new Date();
    let result;
    try {
        result = await coub.write(filename);

        const seconds = (new Date() - startedAt) / 1000;
        log.info(`${downloadCount}:finish download ${c} ${id}.mp4`, filename, `用时${seconds}s`);
        downloadCount++;

        const videoInfo = {
            desc: video.title,
            category: c,
            filename: `${id}.mp4`
        };
        // Persist right away so the record survives a crash later in the run.
        saveJsonData(videoInfo);
        dlFilesJson.push(videoInfo);
    } catch (error) {
        log.error(`download error ${id}.mp4`, error);
        return next(error, '');
    }
    // Called outside the try block: an exception thrown by `next` itself must
    // not be mistaken for a download error and trigger a second callback.
    return next(null, result);
}

/**
 * Tell whether `<id>.mp4` is already present in any of the known video
 * folders below `__dirname`.
 *
 * The folders are probed in a fixed order and the first hit is logged.
 *
 * @param {string} id video permalink (file name without extension)
 * @returns {boolean} true when the file exists in one of the folders
 */
const isFileExist = id => {
    const candidates = [
        `./src/video/${id}.mp4`,
        `./downloads/video/${id}.mp4`,
        `./weekly/video/${id}.mp4`,
        `./monthly/video/${id}.mp4`,
        `./quarter/video/${id}.mp4`,
        `./half/video/${id}.mp4`
    ].map(relative => path.resolve(__dirname, relative));

    const hit = candidates.find(candidate => fs.existsSync(candidate));
    if (hit === undefined) return false;

    log.info('file exist', hit);
    return true;
};

/**
 * Append one video record to `${jsonPath}/all.json` right after its download
 * succeeds, so the information is not lost if the program crashes midway.
 *
 * The file holds a JSON array; it is read, extended and rewritten
 * synchronously. Failures are logged and never thrown to the caller.
 *
 * @param {*} data record to append
 */
const saveJsonData = data => {
    try {
        const jsonFile = `${jsonPath}/all.json`;

        // Start from the records stored so far, if there are any.
        let jsonData = [];
        if (fs.existsSync(jsonFile)) {
            // Declared locally: assigning to an undeclared name would leak a
            // global in sloppy mode and throw in strict mode, where the catch
            // below would then silently drop the record.
            const fileData = fs.readFileSync(jsonFile, {
                encoding: 'utf8'
            });
            if (fileData) {
                jsonData = JSON.parse(fileData);
            }
        }

        jsonData.push(data);
        fs.writeFileSync(jsonFile, JSON.stringify(jsonData));
    } catch (error) {
        // Include the cause; without it a corrupt file and a permission
        // problem are indistinguishable in the log.
        log.error('写入json文件失败', data, error);
    }
};


/**
 * Crawl one category: collect its video list, download the videos one at a
 * time, then write the per-category JSON summary.
 *
 * Author's note on the merge mode: merging audio and video with `-c` is fast
 * but produces many files with broken sound; without `-c` the merge is slow
 * and CPU-heavy, and several merges running at once practically freeze the
 * machine. The slow mode was chosen for correct audio, and the downloads are
 * queued one after another to keep the machine usable; the only cost is time.
 *
 * @param {string} c category slug
 * @returns {Promise<Array>} resolves with the per-video results of the
 *     download queue; rejects with an Error wrapping the first queue error
 */
async function doDownload(c) {
    const collected = await getMultiVideo(c);
    // Reset the shared accumulator for the next category.
    videoList = [];

    // Flatten one level: an entry may be a single item or an array of items.
    const videos = collected.reduce((flat, entry) => flat.concat(entry), []);
    log.info(`要抓取的 ${c} 类型的视频总数为 ${videos.length} 个`);

    const tasks = videos.map(video => next => {
        downloadFile(c, video, next);
    });

    return new Promise((resolve, reject) => {
        const startedAt = new Date();

        const onQueueDone = (err, results) => {
            const finishedAt = new Date();
            const elapsed = timeUsed((finishedAt - startedAt) / 1000);
            log.info(`finish download ${c} video, 耗时 ${elapsed}`, err, results.length);

            if (err) {
                reject(new Error(err));
                return;
            }

            // Per-category summary, then reset the shared per-category state.
            fs.writeFileSync(`${jsonPath}/${c}.json`, JSON.stringify(dlFilesJson));
            dlFilesJson = [];
            downloadCount = 1;
            resolve(results);
        };

        async.series(tasks, onQueueDone);
    });
}

/**
 * Crawl every category, one after another.
 *
 * Categories are awaited sequentially on purpose: each `doDownload` call
 * finishes before the next one starts.
 *
 * @returns {Promise<number>} total number of entries in the result arrays
 *     that `doDownload` resolved with, summed over all categories. The
 *     completion log prints this value as the video count.
 */
async function main() {
    const categories = [
        'animals-pets',
        'mashup',
        'anime',
        'movies',
        'gaming',
        'cartoons',
        'art',
        'music',
        'news',
        'sports',
        'science-technology',
        'celebrity',
        'nature-travel',
        'fashion',
        'dance',
        'cars',
        'nsfw'
    ];

    let total = 0;
    for (const category of categories) {
        const results = await doDownload(category);
        total += Array.isArray(results) ? results.length : 0;
    }
    return total;
}

/**
 * Format a duration given in seconds as a short human-readable string.
 *
 *   below one minute  -> "<rounded seconds>s"
 *   below one hour    -> "<minutes>m<seconds>s"
 *   below one day     -> "<hours>h<minutes>m"
 *   otherwise         -> "<days>d <hours>h"
 *
 * @param {number} t duration in seconds
 * @returns {string} formatted duration
 */
const timeUsed = t => {
    const MINUTE = 60;
    const HOUR = 60 * MINUTE;
    const DAY = 24 * HOUR;

    if (t < MINUTE) {
        return `${Math.round(t)}s`;
    }
    if (t < HOUR) {
        const minutes = Math.floor(t / MINUTE);
        const seconds = Math.floor(t % MINUTE);
        return `${minutes}m${seconds}s`;
    }
    if (t < DAY) {
        const hours = Math.floor(t / HOUR);
        const minutes = Math.floor((t % HOUR) / MINUTE);
        return `${hours}h${minutes}m`;
    }
    const days = Math.floor(t / DAY);
    const hours = Math.floor((t % DAY) / HOUR);
    return `${days}d ${hours}h`;
};

// Entry point: run the whole crawl, report the elapsed time since the
// module-level `startTime`, then terminate the process explicitly.
main()
    .then(result => {
        const endTime = new Date()
        const usedTime = timeUsed((endTime - startTime) / 1000)
        log.info(`all downloads finish,${result} 个视频,共耗时 ${usedTime}`)
        return 0
    })
    .catch(error => {
        log.error('download error', error)
        // Report the failure to the calling shell instead of exiting with 0.
        return 1
    })
    .then(exitCode => {
        process.exit(exitCode)
    })

// Last-resort handler: log the exception and dump the records collected so
// far, so the metadata of already downloaded videos can be recovered.
process.on('uncaughtException', err => {
    log.error(err)
    log.info(JSON.stringify(dlFilesJson))
})

完整代码: https://github.com/flute/coub-crawler

9GAG.com 内容抓取

抓取说明

1、总共52个分类。

2、数据获取

  • url:https://9gag.com/v1/group-posts/group/cute/type/hot?c=10
  • 说明:cute 为分类。首次获取只需传入 c=10 即为前十条数据。下次请求附带上次请求返回的 nextCursor 参数即可。每次请求返回10条数据。

3、每个资源的属性:

  • 唯一标志: id
  • 资源描述: title

4、资源分三种类型,根据 images 属性下的字段区分

  • image 属性:image460、image700
  • gif 属性:image460、image460sv、image460svwm、image700
  • 说明:image460sv、image460svwm 两个属性下的 hasAudio 字段为0,即为无声,即为GIF

instagram 内容抓取

抓取说明1、需要登录信息,即抓取时需要附带cookie,同时需要user-agent。2、数据获取接口及下载均有频率限制,无间隔的请求(几百个资源)会被限制,在被限制后睡眠一定时间继续。3、内容抓取分为两个入口一个是抓取某个用户发布的所有资源一个是抓取某个tag下的所有资源两种入口附带的cookie不同,请求的URL不同。4、抓取步骤:电脑端登陆ins,保存 cookie、query_hash、user-agent信息。后续所有请求附带cookie及user-agent。模拟请求个人主页/tag主页,通过解析HTML页面,得到userId/tag name。同时拿到第一页的数据及下页cursor。通过API接口,根据cursor持续获取多页数据。所有数据获取完毕后开始下载。返回的数据中,图片资源可以直接下载。视频资源需要再次请求视频地址获取接口获得视频地址,然后再下载。5、请求数据接口:user:https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=