Skip to content

Commit

Permalink
Add big result export
Browse files Browse the repository at this point in the history
  • Loading branch information
marians committed Nov 26, 2018
1 parent 61c555f commit ae40915
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 16 deletions.
96 changes: 80 additions & 16 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import collections
from datetime import datetime
from os import getenv
from wsgiref import simple_server
Expand All @@ -16,34 +17,68 @@
webscreenshots_kind = 'webscreenshot'


def convert_datastore_datetime(field):
    """
    Normalize a Datastore timestamp field to a ``datetime``.

    Depending on the client library, the value arrives as a
    ``datetime.datetime``, an ``int`` or a numeric ``str``. The numeric
    forms are interpreted as microseconds since the Unix epoch and are
    converted to a naive UTC ``datetime``.

    Args:
        field: the raw field value read from the entity.

    Returns:
        ``field`` itself when it already is a ``datetime`` (subclasses
        included), the converted ``datetime`` for int/str input, or ``''``
        for any other type (for example ``None``). Callers that invoke
        ``.isoformat()`` on the result must expect the ``''`` fallback.

    Raises:
        ValueError: if a ``str`` value is not a valid integer literal.
    """
    # isinstance (rather than an exact type comparison) so that datetime
    # subclasses handed out by a client library are passed through instead
    # of silently degrading to ''.
    if isinstance(field, datetime):
        return field
    # bool is an int subclass but never a timestamp; keep it out of the
    # numeric branch, exactly as the strict type comparison used to.
    if isinstance(field, bool):
        return ''
    if isinstance(field, (int, str)):
        # Microseconds -> seconds.
        return datetime.utcfromtimestamp(int(field) / 1000000)
    return ''


def flatten(d, parent_key='', sep='.'):
    """
    Collapse a nested mapping into a single-level dict.

    Keys of nested mappings are joined to the keys of their parents with
    ``sep``, so ``{"a": {"b": 1}}`` becomes ``{"a.b": 1}``. Nested mappings
    that are empty contribute no entries.

    Args:
        d: the mapping to flatten.
        parent_key: prefix prepended (followed by ``sep``) to every key;
            an empty string means no prefix.
        sep: separator placed between key segments.

    Returns:
        A new ``dict`` mapping the joined key paths to the leaf values.
    """
    # The collections.MutableMapping alias was removed in Python 3.10; the
    # ABC lives in collections.abc. Importing it here keeps the function
    # independent of whether that submodule was loaded elsewhere.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat


def get_compact_results(client):
    """
    Return a compact summary of all spider results.

    Queries every entity of kind ``spider_results_kind`` with the order
    specification ``['-created']`` and reduces each entity to a small dict.

    Args:
        client: datastore client exposing ``query(kind=..., order=...)``.

    Returns:
        A list of dicts with the keys ``input_url`` (the entity's key
        name), ``created`` (ISO 8601 string), ``meta`` and ``score``.
    """
    query = client.query(kind=spider_results_kind,
                         order=['-created'],
                         #projection=['created', 'meta', 'score'],
                         )

    out = []
    for entity in query.fetch(eventual=True):
        # The library may hand back 'created' as str, int or datetime;
        # the shared helper normalizes all of these.
        created = convert_datastore_datetime(entity.get('created'))

        out.append({
            'input_url': entity.key.name,
            'created': created.isoformat(),
            'meta': entity.get('meta'),
            'score': entity.get('score'),
        })
    return out



def get_full_results(client):
    """
    Return every spider result as one flat record per site.

    Each entity of kind ``spider_results_kind`` becomes a dict holding
    ``input_url`` (the entity's key name), ``created`` (ISO 8601 string)
    and ``score``, plus the contents of the entity's ``meta`` and
    ``rating`` properties flattened into dotted keys prefixed with
    ``meta.`` and ``rating.`` respectively.

    Args:
        client: datastore client exposing ``query(kind=...)``.

    Returns:
        A list of flat dicts, one per entity.
    """
    query = client.query(kind=spider_results_kind)

    out = []
    for entity in query.fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = {
            'input_url': entity.key.name,
            'created': created.isoformat(),
            'score': entity.get('score'),
        }
        # .get() yields None for an absent property, which flatten() cannot
        # iterate; fall back to an empty mapping so a single incomplete
        # entity does not abort the whole export.
        record.update(flatten(entity.get('meta') or {}, parent_key='meta'))
        record.update(flatten(entity.get('rating') or {}, parent_key='rating'))
        out.append(record)
    return out


class LastUpdated(object):

Expand Down Expand Up @@ -78,6 +113,19 @@ def on_get(self, req, resp):
resp.media = out


class BigResults(object):
    """Resource returning the full, flattened spider results of all sites."""

    def on_get(self, req, resp):
        """
        Returns big sites results

        The response body is the list produced by ``get_full_results`` and
        is marked cacheable for two days.
        """
        out = get_full_results(datastore_client)

        maxage = 48 * 60 * 60  # two days
        # The HTTP directive is spelled "max-age" (hyphen); "max_age" is an
        # unknown directive that caches ignore.
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = out


class SiteDetails(object):

def on_get(self, req, resp):
Expand Down Expand Up @@ -128,6 +176,20 @@ def on_get(self, req, resp):
resp.media = entities


class Index(object):
    """Resource describing the service and listing its endpoints."""

    def on_get(self, req, resp):
        """Respond with the service message, project URL and endpoint paths."""
        endpoint_paths = [
            "/api/v1/spider-results/last-updated/",
            "/api/v1/spider-results/big/",
            "/api/v1/spider-results/compact/",
            "/api/v1/spider-results/site",
            "/api/v1/screenshots/site",
        ]
        info = dict(
            message="This is green-spider-api",
            url="https://github.com/netzbegruenung/green-spider-api",
            endpoints=endpoint_paths,
        )
        resp.media = info

# Media handlers keyed by content type: 'application/json' bodies are
# processed by jsonhandler.JSONHandler.
handlers = media.Handlers({
    'application/json': jsonhandler.JSONHandler(),
})
Expand All @@ -139,8 +201,10 @@ def on_get(self, req, resp):

# Route table: each URL path is bound to one resource instance.
app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
app.add_route('/api/v1/spider-results/compact/', CompactResults())
app.add_route('/api/v1/spider-results/big/', BigResults())
app.add_route('/api/v1/spider-results/site', SiteDetails())
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
# Root path serves the self-describing endpoint listing.
app.add_route('/', Index())


if __name__ == '__main__':
Expand Down
31 changes: 31 additions & 0 deletions main_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import unittest
from main import flatten

class TestFlattenDict(unittest.TestCase):
    """Checks for the flatten() helper imported from main."""

    def test_flatten(self):
        """Nested keys are joined with dots at every nesting depth."""
        leaf = {"one": 1, "two": 2}
        nested = {
            "foo": {"bar": dict(leaf)},
            "bar": dict(leaf),
        }
        expected = {
            "foo.bar.one": 1,
            "foo.bar.two": 2,
            "bar.one": 1,
            "bar.two": 2,
        }
        self.assertEqual(flatten(nested), expected)



# Run the test cases when this module is executed directly.
if __name__ == '__main__':
    unittest.main()

0 comments on commit ae40915

Please sign in to comment.