Skip to content
This repository has been archived by the owner on Jun 10, 2024. It is now read-only.

Commit

Permalink
Merge pull request #955 from binux/fix-test
Browse files Browse the repository at this point in the history
Fix test
  • Loading branch information
binux authored Aug 2, 2020
2 parents 15157ea + 360d131 commit 897891c
Show file tree
Hide file tree
Showing 14 changed files with 71 additions and 189 deletions.
14 changes: 4 additions & 10 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
sudo: required
language: python
cache: pip
python:
- 3.4
- 3.5
- 3.6
- 3.7
Expand All @@ -11,29 +9,25 @@ services:
- docker
- mongodb
- rabbitmq
- redis-server
- redis
- mysql
#- elasticsearch
# - elasticsearch
- postgresql
addons:
postgresql: "9.4"
apt:
packages:
- rabbitmq-server
env:
- IGNORE_COUCHDB=1

before_install:
- echo "deb https://apache.bintray.com/couchdb-deb xenial main" | sudo tee -a /etc/apt/sources.list
- curl -L https://couchdb.apache.org/repo/bintray-pubkey.asc | sudo apt-key add -
- sudo apt-get update -qq
- sudo apt-get install -y couchdb
- sudo systemctl start couchdb
- curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
- npm install express puppeteer
- sudo docker pull scrapinghub/splash
- sudo docker run -d --net=host scrapinghub/splash
before_script:
- curl -X PUT http://127.0.0.1:5984/_users
- curl -X PUT http://127.0.0.1:5984/_replicator
- psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python.

- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...
Expand Down
23 changes: 6 additions & 17 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,15 @@ services:
networks:
- pyspider
command: rabbitmq-server
couchdb:
image: couchdb:latest
container_name: couchdb
mysql:
image: mysql:latest
container_name: mysql
volumes:
- /tmp:/var/lib/mysql
environment:
- COUCHDB_USER=user
- COUCHDB_PASSWORD=password
- MYSQL_ALLOW_EMPTY_PASSWORD=yes
networks:
- pyspider
ports:
- "5984:5984"
# OR we can replace couchdb with mysql
#mysql:
# image: mysql:latest
# container_name: mysql
# volumes:
# - /tmp:/var/lib/mysql
# environment:
# - MYSQL_ALLOW_EMPTY_PASSWORD=yes
# networks:
# - pyspider
phantomjs:
image: pyspider:latest
container_name: phantomjs
Expand Down
22 changes: 2 additions & 20 deletions pyspider/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,26 +214,8 @@ def _connect_couchdb(parsed, dbtype, url):
params = {}

# default to env, then credentials from the url (no hard-coded fallback)
params['username'] = os.environ.get('COUCHDB_USER') or parsed.username or 'user'
params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password or 'password'

# create necessary DBs + the admin user
res = requests.put(url + "_users")
if 'error' in res and res['error'] == 'unauthorized':
# user is already created. This will happen if CouchDB is running in docker
# and COUCHDB_USER and COUCHDB_PASSWORD are set
from requests.auth import HTTPBasicAuth
requests.put(url + "_users",
auth=HTTPBasicAuth(params['username'], params['password']))
requests.put(url + "_replicator",
auth=HTTPBasicAuth(params['username'], params['password']))
requests.put(url + '_node/_local/_config/admins/' + params['username'],
data=params['password'],
auth=HTTPBasicAuth(params['username'], params['password']))
else:
requests.put(url + "_replicator")
requests.put(url + '_node/_local/_config/admins/' + params['username'],
data=params['password'])
params['username'] = os.environ.get('COUCHDB_USER') or parsed.username
params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password

if dbtype == 'taskdb':
from .couchdb.taskdb import TaskDB
Expand Down
38 changes: 13 additions & 25 deletions pyspider/database/couchdb/couchdbbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
class SplitTableMixin(object):
UPDATE_PROJECTS_TIME = 10 * 60

def __init__(self):
    """Build the shared HTTP session used for every CouchDB request.

    Reads ``self.username`` / ``self.password``, which the concrete class
    must set before calling this initializer.
    """
    session = requests.session()
    # Basic auth is attached only when a username was supplied.
    if self.username:
        session.auth = HTTPBasicAuth(self.username, self.password)
    # Every request made through the session carries a JSON content type.
    session.headers.update({'Content-Type': 'application/json'})
    self.session = session

def _collection_name(self, project):
if self.collection_prefix:
return "%s_%s" % (self.collection_prefix, project)
Expand Down Expand Up @@ -32,10 +38,7 @@ def _list_project(self):
prefix = ''

url = self.base_url + "_all_dbs"
res = requests.get(url,
data=json.dumps({}),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.get(url, json={}).json()
for each in res:
if each.startswith('_'):
continue
Expand All @@ -45,19 +48,15 @@ def _list_project(self):

def create_database(self, name):
    """Create the database ``name`` under ``self.base_url``.

    Issues a single PUT through the shared session and returns the decoded
    JSON reply.

    Raises:
        Exception: if the reply carries ``error == 'unauthorized'``.
    """
    url = self.base_url + name
    res = self.session.put(url).json()
    if 'error' in res and res['error'] == 'unauthorized':
        # The password is deliberately left out of the message so that
        # credentials do not end up in logs or tracebacks.
        raise Exception(
            "Supplied credentials are incorrect. Reason: {} for User: {}".format(
                res['reason'], self.username))
    return res


def get_doc(self, db_name, doc_id):
    """Fetch the document ``doc_id`` from database ``db_name``.

    Returns:
        The decoded JSON reply, or ``None`` when the reply carries
        ``error == 'not_found'``.
    """
    url = self.base_url + db_name + "/" + doc_id
    # One request through the shared session (auth and headers live there).
    res = self.session.get(url).json()
    if "error" in res and res["error"] == "not_found":
        return None
    return res
Expand All @@ -66,10 +65,7 @@ def get_doc(self, db_name, doc_id):
def get_docs(self, db_name, selector):
    """Run a ``_find`` query against database ``db_name``.

    Args:
        db_name: name of the database to query.
        selector: query payload; it is copied before ``use_index`` is
            added, so the caller's dict is left untouched.

    Returns:
        The ``docs`` list from the reply, or ``[]`` when the reply carries
        ``error == 'not_found'``.
    """
    url = self.base_url + db_name + "/_find"
    # Work on a shallow copy so the caller's selector is not mutated.
    query = dict(selector)
    query['use_index'] = self.index
    res = self.session.post(url, json=query).json()
    if 'error' in res and res['error'] == 'not_found':
        return []
    return res['docs']
Expand All @@ -81,10 +77,7 @@ def get_all_docs(self, db_name):

def insert_doc(self, db_name, doc_id, doc):
    """Store ``doc`` under ``doc_id`` in database ``db_name``.

    Sends one PUT through the shared session with ``doc`` as the JSON body
    and returns the decoded JSON reply.
    """
    url = self.base_url + db_name + "/" + doc_id
    return self.session.put(url, json=doc).json()


def update_doc(self, db_name, doc_id, new_doc):
Expand All @@ -94,14 +87,9 @@ def update_doc(self, db_name, doc_id, new_doc):
for key in new_doc:
doc[key] = new_doc[key]
url = self.base_url + db_name + "/" + doc_id
return requests.put(url,
data=json.dumps(doc),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
return self.session.put(url, json=doc).json()


def delete(self, url):
    """Send a single DELETE to ``url`` and return the decoded JSON reply."""
    return self.session.delete(url).json()

41 changes: 13 additions & 28 deletions pyspider/database/couchdb/projectdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,19 @@
class ProjectDB(BaseProjectDB):
__collection_name__ = 'projectdb'

def __init__(self, url, database='projectdb', username='username', password='password'):
def __init__(self, url, database='projectdb', username=None, password=None):
self.username = username
self.password = password
self.url = url + self.__collection_name__ + "_" + database + "/"
self.database = database
self.insert('', {})

self.session = requests.session()
if username:
self.session.auth = HTTPBasicAuth(self.username, self.password)
self.session.headers.update({'Content-Type': 'application/json'})

# Create the db
res = requests.put(self.url,
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.put(self.url).json()
if 'error' in res and res['error'] == 'unauthorized':
raise Exception(
"Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'],
Expand All @@ -29,9 +31,7 @@ def __init__(self, url, database='projectdb', username='username', password='pas
},
'name': self.__collection_name__ + "_" + database
}
res = requests.post(self.url+"_index", data=json.dumps(payload),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.post(self.url + "_index", json=payload).json()
self.index = res['id']

def _default_fields(self, each):
Expand All @@ -51,10 +51,7 @@ def insert(self, name, obj={}):
obj = dict(obj)
obj['name'] = name
obj['updatetime'] = time.time()
res = requests.put(url,
data = json.dumps(obj),
headers = {"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.put(url, json=obj).json()
return res

def update(self, name, obj={}, **kwargs):
Expand All @@ -78,10 +75,7 @@ def get_all(self, fields=None):
"use_index": self.index
}
url = self.url + "_find"
res = requests.post(url,
data=json.dumps(payload),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.post(url, json=payload).json()
for doc in res['docs']:
yield self._default_fields(doc)

Expand All @@ -95,10 +89,7 @@ def get(self, name, fields=None):
"use_index": self.index
}
url = self.url + "_find"
res = requests.post(url,
data=json.dumps(payload),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.post(url, json=payload).json()
if len(res['docs']) == 0:
return None
return self._default_fields(res['docs'][0])
Expand All @@ -115,13 +106,7 @@ def drop(self, name):
doc = self.get(name)
payload = {"rev": doc["_rev"]}
url = self.url + name
return requests.delete(url,
params=payload,
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
return self.session.delete(url, params=payload).json()

def drop_database(self):
    """Delete the database at ``self.url`` and return the decoded JSON reply."""
    return self.session.delete(self.url).json()
13 changes: 5 additions & 8 deletions pyspider/database/couchdb/resultdb.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
import time, json, requests
from requests.auth import HTTPBasicAuth
import time, json
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from .couchdbbase import SplitTableMixin


class ResultDB(SplitTableMixin, BaseResultDB):
collection_prefix = ''

def __init__(self, url, database='resultdb', username='username', password='password'):
def __init__(self, url, database='resultdb', username=None, password=None):
self.username = username
self.password = password

self.base_url = url
self.url = url + database + "/"
self.database = database

super().__init__()
self.create_database(database)
self.index = None

Expand All @@ -31,10 +31,7 @@ def _create_project(self, project):
'name': collection_name
}

res = requests.post(self.base_url + collection_name + "/_index",
data=json.dumps(payload),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
self.index = res['id']
self._list_project()

Expand Down
14 changes: 6 additions & 8 deletions pyspider/database/couchdb/taskdb.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import json, time, requests
from requests.auth import HTTPBasicAuth
import json, time
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from .couchdbbase import SplitTableMixin


class TaskDB(SplitTableMixin, BaseTaskDB):
collection_prefix = ''

def __init__(self, url, database='taskdb', username='username', password='password'):
def __init__(self, url, database='taskdb', username=None, password=None):
self.username = username
self.password = password
self.base_url = url
self.url = url + database + "/"
self.database = database
self.create_database(database)
self.index = None

super().__init__()

self.create_database(database)
self.projects = set()
self._list_project()

Expand All @@ -32,10 +33,7 @@ def _create_project(self, project):
},
'name': collection_name
}
res = requests.post(self.base_url + collection_name + "/_index",
data=json.dumps(payload),
headers={"Content-Type": "application/json"},
auth=HTTPBasicAuth(self.username, self.password)).json()
res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
self.index = res['id']
self._list_project()

Expand Down
12 changes: 6 additions & 6 deletions pyspider/libs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,9 +432,9 @@ def python_console(namespace=None):


def check_port_open(port, addr='127.0.0.1'):
    """Report whether a TCP connection to ``addr:port`` can be established.

    Args:
        port: TCP port number to probe.
        addr: IPv4 address or host name to connect to.

    Returns:
        True when ``connect_ex`` reports success (0), False otherwise.
    """
    # The context manager closes the probe socket on every path, so repeated
    # checks do not leak file descriptors.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        return sock.connect_ex((addr, port)) == 0
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Flask==0.10
Jinja2==2.7
chardet==2.2.1
chardet==3.0.4
cssselect==0.9
lxml==4.3.3
pycurl==7.43.0.3
pyquery==1.4.0
requests==2.2
requests==2.24.0
tornado==4.5.3
mysql-connector-python==8.0.16
pika==1.1.0
Expand Down
Loading

0 comments on commit 897891c

Please sign in to comment.