9GAG.com 内容抓取

2018-09-25 19:39:02

抓取说明

1、总共52个分类。

2、数据获取

  • url:https://9gag.com/v1/group-posts/group/cute/type/hot?c=10
  • 说明:cute 为分类。首次获取只需传入 c=10 即为前十条数据。下次请求附带上次请求返回的 nextCursor 参数即可。每次请求返回10条数据。

3、每个资源的属性:

  • 唯一标志: id
  • 资源描述: title

4、资源分三种类型,根据images属性下的字段区分

  1. image  属性:image460    image700  
  2. gif  属性:image460    image460sv  image460svwm    image700 说明:image460sv image460svwm 两个属性下的 hasAudio 字段为0,即无声,即为GIF
  3. video  属性:image460    image460sv  image460svwm   image700 说明:image460sv image460svwm 两个属性下的 hasAudio 字段为1,即有声,即为video

5、内容字段

image460 : {
    height: 258
    url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460s.jpg"
    webpUrl: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460swp.webp"
    width: 460
}

image460sv: {
    duration: 32
    h265Url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svh265.mp4"
    hasAudio: 1
    height: 258
    url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460sv.mp4"
    vp9Url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svvp9.webm"
    width: 460
}

image460svwm : {
    duration: 32
    hasAudio: 1
    height: 258
    url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svwm.webm"
    width: 460
}

image700 : {
    height: 258
    url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460s.jpg"
    width: 460
}

6、分类数组

[
    'funny',
    'cute',
    'anime-manga',
    'ask9gag',
    'awesome',
    'basketball',
    'car',
    'comic',
    'cosplay',
    'country',
    'classicalartmemes',
    'imadedis',
    'drawing',
    'animefanart',
    'food',
    'football',
    'fortnite',
    'gaming',
    'gif',
    'girl',
    'girly',
    'guy',
    'history',
    'horror',
    'home',
    'kpop',
    'leagueoflegends',
    'lego',
    'movie-tv',
    'music',
    'overwatch',
    'pcmr',
    'photography',
    'pokemon',
    'politics',
    'relationship',
    'pubg',
    'roastme',
    'savage',
    'starwars',
    'satisfying',
    'school',
    'science',
    'superhero',
    'surrealmemes',
    'sport',
    'travel',
    'timely',
    'video',
    'warhammer',
    'wallpaper',
    'wtf'
]

核心代码:

/**
 * Fetch one page of posts for a category and append them to the shared
 * `videoList` accumulator.
 *
 * Shaped as an `async.waterfall` step: it receives `(category, offset)` from
 * the previous step and hands `(category, nextOffset)` to the following one.
 *
 * @param {string} category - Category slug interpolated into the request URL.
 * @param {string|number} offset - `''` for the first page (sent as `c=10`),
 *     a cursor query string returned by the previous page, or `-1` once a
 *     previous step found no more posts.
 * @param {Function} next - Node-style callback. Called with
 *     `(null, category, nextCursor)` on success, `(null, category, -1)` when
 *     the page is empty, `('complete')` when `offset` is `-1`, or
 *     `(error, null)` when the request or the JSON decoding fails.
 */
const get9gagList = (category, offset, next) => {

    const options = {
        method: 'GET',
        // `type` is not a parameter of this function; it is read from the enclosing scope.
        url: `https://9gag.com/v1/group-posts/group/${category}/type/${type}?`
    };

    // A previous step reported the end of the listing: abort the remaining
    // waterfall steps by passing a non-null first argument.
    if (offset === -1) {
        return next('complete')
    }
    options.url += offset === '' ? 'c=10' : offset

    request(options, function (error, response, body) {
        if (error) {
            next(error, null)
            return
        }

        // The body is not guaranteed to be JSON (e.g. an HTML error page).
        // Report the failure through `next` instead of throwing inside the
        // callback, where nothing could catch it.
        let data
        try {
            data = JSON.parse(body)
        } catch (parseError) {
            next(parseError, null)
            return
        }

        if (data && data.data && data.data.posts && data.data.posts.length) {
            log.info(`获取 ${category} 视频列表成功 offset ${offset? offset: 'c=10'}`, data.data.posts.length)
            videoList = videoList.concat(data.data.posts)
            return next(null, category, data.data.nextCursor)
        } else {
            log.info(`获取 ${category} 内容为空 offset ${offset},所有数据获取完毕 。`)
            return next(null, category, -1)
        }
    });

}

/**
 * Crawl up to `pageCount` pages of one category, one request after another.
 *
 * The waterfall is seeded with `(category, '')` and followed by `pageCount`
 * copies of `get9gagList`, each of which receives the cursor produced by the
 * step before it.
 *
 * @param {string} category - Category slug to crawl.
 * @returns {Promise<Array>} Resolves with the shared `videoList` accumulator.
 *     The promise never rejects: a waterfall error is only logged, so the
 *     posts collected before the error are still returned.
 */
const getMultiList = async category => {
    const seed = async.constant(category, '')
    const pageSteps = Array.from({ length: pageCount }, () => get9gagList)
    const steps = [seed].concat(pageSteps)

    return new Promise(resolve => {
        async.waterfall(steps, (err, result) => {
            log.info(`finish crawler ${category} videos`, err, videoList.length)
            if (err) {
                log.info(err)
            }
            resolve(videoList)
        })
    })
}

/**
 * Download one media item (video or image) to disk.
 *
 * Items that already exist on disk, and items whose `type` is neither
 * `'video'` nor `'img'`, are skipped. Every failure is logged and then
 * skipped as well: `next` is always called with `null`, so one broken item
 * does not stop the rest of the series.
 *
 * @param {string} category - Category the item belongs to (used for logging
 *     and stored in the JSON index).
 * @param {{id: string, type: string, url: string, desc: string}} media -
 *     Item to download.
 * @param {Function} next - Callback invoked exactly once with `null` when
 *     the item has been handled.
 */
const download = (category, media, next) => {
    if (isFileExist(media.id)) return next(null)

    let filePath
    if (media.type === 'video') {
        filePath = `${videoDlPath}/${media.id}.mp4`
    } else if (media.type === 'img') {
        filePath = `${imgDlPath}/${media.id}.jpg`
    } else {
        return next(null)
    }

    // Several stream events can report the end of the same download; make
    // sure the series callback fires only once.
    let settled = false
    const done = () => {
        if (settled) return
        settled = true
        next(null)
    }

    // Log the failure and move on. When a file was already being written,
    // remove it so that isFileExist() does not later treat a truncated file
    // as a finished download.
    const fail = (err, fws) => {
        if (settled) return
        log.error('download error', err)
        if (fws) {
            fws.destroy()
            fs.unlink(filePath, () => {})
        }
        done()
    }

    request(media.url)
        // Connection-level failures (DNS, reset, timeout) are emitted here,
        // before any 'response' event.
        .on('error', err => fail(err))
        .on('response', function (res) {
            // Do not save an error page under a media file name.
            if (res.statusCode < 200 || res.statusCode >= 300) {
                res.resume()
                return fail(new Error(`unexpected status ${res.statusCode} for ${media.url}`))
            }

            const fws = fs.createWriteStream(filePath)
            res.pipe(fws)

            // 'finish' on the write stream means the data has been handed to
            // the file; 'end' on the response only means it was fully read.
            fws.on('finish', function () {
                if (settled) return
                log.info(`finish download ${category} ${filePath}`)
                saveJsonData(media.type, {
                    id: media.id,
                    category: category,
                    desc: media.desc
                })
                if (media.type === 'video') videoAmount++
                else imgAmount++

                done()
            })
            fws.on('error', err => fail(err, fws))
            res.on('error', err => fail(err, fws))
        })
}

/**
 * Tell whether a media item has already been downloaded.
 *
 * Looks for `<videoDlPath>/<id>.mp4` first, then `<imgDlPath>/<id>.jpg`, and
 * logs the first path that is found.
 *
 * @param {string} id - Media identifier used as the file base name.
 * @returns {boolean} `true` when either file exists, `false` otherwise.
 */
const isFileExist = id => {
    const candidates = [
        { label: 'video file exist', file: `${videoDlPath}/${id}.mp4` },
        { label: 'img file exist', file: `${imgDlPath}/${id}.jpg` }
    ]

    const found = candidates.find(candidate => fs.existsSync(candidate.file))
    if (!found) return false

    log.info(found.label, found.file)
    return true
}

/**
 * Append one record to the JSON index of downloaded media.
 *
 * Called right after each successful download so that the index stays up to
 * date even if the program crashes part-way through a crawl. The index file
 * is `<videoJsonPath>/data.json` for videos and `<imgJsonPath>/data.json`
 * for everything else; it holds a single JSON array.
 *
 * Failures (unreadable file, invalid JSON, write error) are logged and
 * swallowed on purpose: a broken index must not abort the crawl.
 *
 * @param {string} type - `'video'` selects the video index, any other value
 *     selects the image index.
 * @param {Object} data - Record to append.
 */
const saveJsonData = (type, data) => {
    try {
        const jsonFile = `${type == 'video' ? videoJsonPath : imgJsonPath}/data.json`

        // Start from the records already stored, if any.
        let jsonData = []
        if (fs.existsSync(jsonFile)) {
            // Declared locally: assigning an undeclared name creates a global
            // in sloppy mode and throws a ReferenceError in strict mode.
            const fileData = fs.readFileSync(jsonFile, {
                encoding: 'utf8'
            })
            if (fileData) {
                jsonData = JSON.parse(fileData)
            }
        }

        jsonData.push(data)
        fs.writeFileSync(jsonFile, JSON.stringify(jsonData));

    } catch (error) {
        // Include the cause, otherwise the failure cannot be diagnosed.
        log.error('写入json文件失败', data, error)
    }

}

/**
 * Convert a silent MP4 into a GIF.
 *
 * Both paths are hard-coded: `./233.mp4` is read and `./233.gif` is written.
 * The conversion is delegated to the `ffmpeg` command builder that is in
 * scope (presumably fluent-ffmpeg; confirm against the file's imports).
 */
const convertVideoToGift = () => {
    const sourcePath = './233.mp4'
    ffmpeg(sourcePath)
        .format('gif')
        .save('./233.gif')
}

/**
 * Select the posts worth downloading: videos with sound, and still images.
 *
 * - A post with `images.image460sv` is kept as a video only when that entry
 *   has a truthy `hasAudio` and a `url`; silent clips (GIFs) are dropped.
 * - A post without `images.image460sv` is kept as an image when
 *   `images.image460.url` is present.
 * - Posts lacking `images`, or lacking both entries, are skipped rather than
 *   throwing, so one malformed post cannot abort a whole category.
 *
 * @param {Array<Object>} data - Raw posts as returned by the listing API.
 * @returns {{results: Array<Object>, video: number, img: number}}
 *     `results` lists all videos followed by all images, each as
 *     `{id, type, url, desc}`; `video` and `img` are the respective counts.
 */
const mediaFilter = data => {
    const videos = []
    const imgs = []

    for (const post of data) {
        const images = post && post.images
        if (!images) continue

        const clip = images.image460sv
        const still = images.image460

        if (clip) {
            // Video with sound; a clip without audio is a GIF and is ignored.
            if (clip.hasAudio && clip.url) {
                videos.push({
                    id: post.id,
                    type: 'video',
                    url: clip.url,
                    desc: post.title
                })
            }
        } else if (still && still.url) {
            // Still image
            imgs.push({
                id: post.id,
                type: 'img',
                url: still.url,
                desc: post.title
            })
        }
    }

    return {
        results: videos.concat(imgs),
        video: videos.length,
        img: imgs.length
    }
}

/**
 * Crawl one category: list its posts, keep the downloadable ones and
 * download them one at a time.
 *
 * @param {string} category - Category slug to crawl.
 * @param {Function} next - Node-style callback, called with `null` when the
 *     category has been processed, or with the error that stopped it.
 */
const task = async (category, next) => {
    let mediaItems
    try {
        const posts = await getMultiList(category)
        // getMultiList resolves with the shared `videoList` accumulator;
        // reset it so the next category starts from an empty list.
        videoList = []
        log.info('数据获取成功', posts.length)

        const filtered = mediaFilter(posts)
        mediaItems = filtered.results
        log.info(`${posts.length} 个内容,有声视频共 ${filtered.video} 个,图片共 ${filtered.img} 个`)
    } catch (prepareError) {
        // Without this, a failure here would only reject the promise returned
        // by this function, `next` would never run and the outer series
        // would wait forever.
        log.error(`finish【${category}】all download error`, prepareError)
        return next(prepareError)
    }

    const dlActions = mediaItems.map(media => done => download(category, media, done))

    async.series(dlActions, (err, result) => {
        if (err) {
            // Report the callback's own `err`.
            log.error(`finish【${category}】all download error`, err)
            return next(err)
        }
        log.info(`finish【${category}】all downloads success`, result.filter(item => item).length)
        return next(null)
    })
}

/**
 * Crawl every category in `category`, one after another.
 *
 * @returns {Promise<Array>} Resolves with the results collected by
 *     `async.series`; rejects with an `Error` wrapping the first failure
 *     reported by a category task.
 */
const main = () => {
    // One series step per category; each step hands its callback to `task`.
    const toStep = name => done => task(name, done)
    const steps = category.map(name => toStep(name))

    return new Promise((resolve, reject) => {
        async.series(steps, (err, result) => {
            if (err) {
                reject(new Error(err))
                return
            }
            resolve(result)
        })
    })
}

// Entry point: run the whole crawl, report the totals, then terminate the
// process explicitly. The exit code is 0 on success and 1 on failure, so
// shells, cron jobs and CI can tell the two apart.
main()
    .then(result => {
        log.info(`awsome! all ${result.length} tasks finish success! video: ${videoAmount} 个, img: ${imgAmount} 个`)
        return 0
    })
    .catch(error => {
        log.error(`all tasks finish error!  video: ${videoAmount}, img: ${imgAmount}`, error)
        return 1
    })
    .then(exitCode => {
        process.exit(exitCode)
    })

完整代码: https://github.com/flute/9gag-crawler

干锅土豆片+尖椒肉丝

虽然国家一直在推行简化各种手续的办理流程,但是距离像网上购物一样便捷的愿望,真的是还差两个西天取经的路程🙄。不吐槽了,开始主题。忙里偷闲的一天,办完手续中午在家自己整点吃的。冰箱打开只有土豆、洋葱、辣椒、肉丝。那就整两个菜吧,如题。其实应该叫家常土豆片?不过放在干锅里就叫干锅土豆片了...。别问我为什么这么喜欢土豆,因为我种过将近十年土豆😂。非常简单实用的两个小菜。材料:土豆、洋葱、辣椒;葱、姜、蒜、干辣椒;火锅底料;生抽、老抽、盐、胡椒粉、鸡精。开搞:一、干锅土豆片。1、准备食材:食材准备,土豆切片洗净,洋葱切片,辣椒切成丝或者块都行。葱姜蒜切好,少许火锅底料/豆瓣酱。2、炒土豆:开火热锅,锅热后倒少许油。稍许油热后倒入少许火锅底料、干辣椒、葱姜蒜,煸香后倒入土豆翻炒,中途可加入少许食盐、胡椒粉。翻炒大约2分钟,土豆片已经半熟。倒入洋葱继续翻炒,加入老抽上色。油干的时候边炒边加入少许清水,

coub.com 内容抓取

抓取说明 1、总共17个分类。 2、数据获取 url:https://coub.com/api/v2/timeline/hot/movies/half?per_page=25 说明:movies 为分类。 per_page 为每页返回的数据量[1,25]。首次获取只需传入 page=1 即为第一页的数据。下次请求附带字段 anchor 为上次请求返回的 next 参数即可。 3、每个资源的属性: 唯一标志: id、permalink 资源描述: title 4、下载 coub.com的音频和视频是分开的,下载的时候需要将音视频分别下载,然后使用FFmpeg合并。下载及合并使用开源项目 https://github.com/TeeSeal/coub-dl 5、分类数组 ["animals-pets"