Commit 0bc83173 authored by Matthew Cooper's avatar Matthew Cooper

IPython notebook for filtering out climate change (CC) files

parent 9b4f6fb4
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from shutil import copyfile\n",
"\n",
"# Switch the working directory to the catch-webhose dump; the filtering\n",
"# cell that follows opens its files by bare name.\n",
"os.chdir(\"/nfs/textdataclimateshocks-data/raw/webhose/keywords/catch-webhose/\")\n",
"\n",
"# Entry names (not full paths) of everything in that directory.\n",
"catch_webhose = os.listdir(path=\".\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"5000\n",
"10000\n",
"15000\n",
"20000\n",
"25000\n",
"30000\n",
"35000\n",
"40000\n",
"45000\n",
"50000\n",
"55000\n",
"60000\n",
"65000\n",
"70000\n",
"75000\n",
"80000\n",
"85000\n",
"90000\n"
]
}
],
"source": [
"# Copy every catch-webhose file whose text mentions 'climate change' or\n",
"# 'global warming' (case-insensitive) into the shared climatechange folder.\n",
"# Names are relative, so the working directory must still be the\n",
"# catch-webhose directory the names were listed from.\n",
"for i, name in enumerate(catch_webhose):\n",
"    # Coarse progress marker every 5000 entries.\n",
"    if i % 5000 == 0:\n",
"        print(i)\n",
"    try:\n",
"        # The context manager closes each handle as soon as it has been read.\n",
"        with open(name) as f:\n",
"            txt = f.read().lower()\n",
"        if 'climate change' in txt or 'global warming' in txt:\n",
"            copyfile(name, '/nfs/textdataclimateshocks-data/raw/webhose/climatechange/' + name)\n",
"    except Exception:\n",
"        # Best-effort: skip entries that cannot be opened, decoded or copied,\n",
"        # while still letting KeyboardInterrupt/SystemExit stop the run.\n",
"        continue"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Switch the working directory to the 2018-02 dump; the filtering cell\n",
"# that follows opens its files by bare name.\n",
"os.chdir(\"/nfs/textdataclimateshocks-data/raw/webhose/keywords/2018-02/\")\n",
"\n",
"# Entry names (not full paths) of everything in that directory.\n",
"feb = os.listdir(path=\".\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"5000\n",
"10000\n",
"15000\n",
"20000\n",
"25000\n",
"30000\n",
"35000\n",
"40000\n",
"45000\n",
"50000\n",
"55000\n",
"60000\n",
"65000\n",
"70000\n",
"75000\n",
"80000\n",
"85000\n",
"90000\n",
"95000\n",
"100000\n",
"105000\n",
"110000\n",
"115000\n"
]
}
],
"source": [
"# Copy every 2018-02 file whose text mentions 'climate change' or\n",
"# 'global warming' (case-insensitive) into the shared climatechange folder.\n",
"# Names are relative, so the working directory must still be the\n",
"# 2018-02 directory the names were listed from.\n",
"for i, name in enumerate(feb):\n",
"    # Coarse progress marker every 5000 entries.\n",
"    if i % 5000 == 0:\n",
"        print(i)\n",
"    try:\n",
"        # The context manager closes each handle as soon as it has been read.\n",
"        with open(name) as f:\n",
"            txt = f.read().lower()\n",
"        if 'climate change' in txt or 'global warming' in txt:\n",
"            copyfile(name, '/nfs/textdataclimateshocks-data/raw/webhose/climatechange/' + name)\n",
"    except Exception:\n",
"        # Best-effort: skip entries that cannot be opened, decoded or copied,\n",
"        # while still letting KeyboardInterrupt/SystemExit stop the run.\n",
"        continue"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Switch the working directory to the random dump; the filtering cell\n",
"# that follows opens its files by bare name.\n",
"os.chdir(\"/nfs/textdataclimateshocks-data/raw/webhose/random/\")\n",
"\n",
"# Entry names (not full paths) of everything in that directory.\n",
"rand = os.listdir(path=\".\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copy every file from the random dump whose text mentions 'climate change'\n",
"# or 'global warming' (case-insensitive) into the shared climatechange folder.\n",
"# Names are relative, so the working directory must still be the\n",
"# random directory the names were listed from.\n",
"for i, name in enumerate(rand):\n",
"    # Should be about 11 million total, hence the sparser progress marker.\n",
"    if i % 100000 == 0:\n",
"        print(i)\n",
"    try:\n",
"        # The context manager closes each handle as soon as it has been read,\n",
"        # which matters with this many files.\n",
"        with open(name) as f:\n",
"            txt = f.read().lower()\n",
"        if 'climate change' in txt or 'global warming' in txt:\n",
"            copyfile(name, '/nfs/textdataclimateshocks-data/raw/webhose/climatechange/' + name)\n",
"    except Exception:\n",
"        # Best-effort: skip entries that cannot be opened, decoded or copied,\n",
"        # while still letting KeyboardInterrupt/SystemExit stop the run.\n",
"        continue"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment