Skip to content

Commit

Permalink
Fixes to schema positioning
Browse files Browse the repository at this point in the history
  • Loading branch information
tiberiuichim committed Aug 17, 2017
1 parent 37c2817 commit b8623f7
Show file tree
Hide file tree
Showing 9 changed files with 39 additions and 17 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,4 @@ pip-selfcheck.json
corpus/*
examples/*
src/eea.corpus/.cache/*
src/eea.corpus/eea.corpus.egg-info
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ WORKDIR /src/eea.corpus

RUN pip --no-cache-dir install -U -r requirements.txt

RUN python -m spacy.en.download all

# convert phrasemachine to python3 code
RUN cd /usr/local/lib/python3.5/site-packages/phrasemachine \
&& 2to3 -w *.py
Expand All @@ -18,8 +20,6 @@ WORKDIR /src/eea.corpus

RUN pip install -e ".[testing]"

RUN python -m spacy.en.download all

CMD pserve /src/eea.corpus/development.ini

EXPOSE 6543
5 changes: 3 additions & 2 deletions devel-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ services:
environment:
- REDIS_URL=redis://redis:6379/0
volumes:
- ./src:/src
- ./src/eea.corpus/eea:/src/eea.corpus/eea
- ./src/setup.py:/src/setup.py
- corpus-data:/corpus
# - ./corpus:/corpus

Expand All @@ -19,7 +20,7 @@ services:

debugworker:
build: .
command: sh -c "python setup.py develop && worker development.ini"
command: sh -c "worker development.ini"
environment:
- REDIS_URL=redis://redis:6379/0
volumes:
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ services:
- REDIS_URL=redis://redis:6379/0
volumes:
- corpus-data:/corpus
command: sh -c "python setup.py develop && pserve production.ini"
command: sh -c "pserve production.ini"

redis:
image: redis

worker:
image: eeacms/corpus:pyramid_service
command: sh -c "python setup.py develop && worker production.ini"
command: sh -c "worker production.ini"
environment:
- REDIS_URL=redis://redis:6379/0
volumes:
Expand Down
4 changes: 3 additions & 1 deletion src/eea.corpus/eea/corpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from eea.corpus.processing import build_pipeline
from eea.corpus.utils import corpus_base_path
from eea.corpus.utils import metadata
from eea.corpus.utils import to_text
from rq.decorators import job
import json
import logging
Expand Down Expand Up @@ -42,7 +43,8 @@ def build_corpus(pipeline, corpus_id, file_name, text_column, **kw):
content_stream = build_pipeline(file_name, text_column, pipeline,
preview_mode=False)

content = (to_text(doc) for doc in content_stream)
# TODO: save metadata stream
corpus = textacy.Corpus('en', texts=content_stream)
corpus = textacy.Corpus('en', texts=content)
corpus.save(cpath, name=corpus_id)
save_corpus_metadata(corpus, file_name, corpus_id, text_column, **kw)
9 changes: 8 additions & 1 deletion src/eea.corpus/eea/corpus/processing/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,11 @@ def process(content, env, **settings):
)
continue

yield to_doc(clean)
try:
yield to_doc(clean)
except Exception:
logger.exception(
"BS4 Processor: got an error converting to Doc: %r",
doc
)
continue
6 changes: 4 additions & 2 deletions src/eea.corpus/eea/corpus/processing/phrases/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,16 @@ def produce_phrases(content, env, settings):
# something wrong with the job, forcing build phrases
phrase_model_pipeline = get_pipeline_for_component(env)
logger.info("Phrase processor: producing phrase model %s", phash_id)
build_phrases(
job = build_phrases(
phrase_model_pipeline,
file_name,
text_column,
phash_id,
settings,
meta={'phash_id': phash_id},
)
job.meta = {'phash_id': phash_id},
job.save_meta()


yield from cached_phrases(content, env, settings)

Expand Down
12 changes: 12 additions & 0 deletions src/eea.corpus/eea/corpus/tests/processing/test_html.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from unittest.mock import patch


class TestHTML:
texts = (
"<strong>Hello</strong> world",
Expand Down Expand Up @@ -45,3 +48,12 @@ def test_from_doc(self, doc_content_stream):
doc = next(stream)
assert isinstance(doc, Doc)
assert doc.text.startswith('assessment-2 Use of freshwater resources')

@patch('eea.corpus.processing.html.to_doc')
def test_to_doc_with_error(self, to_doc):
from eea.corpus.processing.html import process

to_doc.side_effect = ValueError()

stream = process(['hello', 'world'], {})
assert list(stream) == []
11 changes: 4 additions & 7 deletions src/eea.corpus/eea/corpus/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ def get_pipeline_components(self):
p = pipeline_registry[_type]
kwargs = state[k].copy()
kwargs.pop('schema_type')
position = kwargs.pop('schema_position')
schemas[position] = (p.name, kwargs)
pos = kwargs.pop('schema_position')
schemas[pos] = (p.name, k, kwargs)

return [schemas[k] for k in sorted(schemas.keys())]

Expand All @@ -197,9 +197,6 @@ def _extract_pipeline_schemas(self):
p = pipeline_registry[_type]
s = p.schema(name=k, title=p.title)
pos = v.pop('schema_position')
# TODO: remove schema_position from passed form data in show()
# if pos in schemas:
# import pdb; pdb.set_trace()
schemas[pos] = s

# Handle subschemas clicked buttons: perform appropriate operations
Expand Down Expand Up @@ -291,7 +288,7 @@ def show(self, form):
# positions
for k, v in appstruct.items():
if isinstance(v, dict): # TODO: may not be correct in all cases
if v.get('schema_position'):
if v.get('schema_position') is not None:
v['schema_position'] = schema[k]['schema_position'].default

# now add new schemas, at the end of all others
Expand All @@ -300,7 +297,7 @@ def show(self, form):
p = pipeline_registry[add_component]
s = p.schema(name=rand(10), title=p.title,)
f = s['schema_position']
f.default = f.missing = len(schema.children)
f.default = f.missing = 999 # len(schema.children)
schema.add(s)
appstruct['pipeline_components'] = ''

Expand Down

0 comments on commit b8623f7

Please sign in to comment.