From ba83064608f2d6c18ec7d86e49269c1da565d9b4 Mon Sep 17 00:00:00 2001
From: Harsh J
Date: Thu, 28 Apr 2011 11:57:14 +0530
Subject: [PATCH] HUE-1. Add avro file viewer support to File Browser.

---
 apps/filebrowser/src/filebrowser/static/css/fb.css  |    4 +-
 .../src/filebrowser/templates/display.mako          |    6 +-
 apps/filebrowser/src/filebrowser/views.py           |  137 +++-
 apps/filebrowser/src/filebrowser/views_test.py      |   60 ++
 desktop/core/ext-py/avro-1.5.0/PKG-INFO             |   11 +
 desktop/core/ext-py/avro-1.5.0/setup.py             |   40 +
 .../core/ext-py/avro-1.5.0/src/avro/__init__.py     |   18 +
 .../core/ext-py/avro-1.5.0/src/avro/datafile.py     |  331 ++++++++
 desktop/core/ext-py/avro-1.5.0/src/avro/io.py       |  877 ++++++++++++++++++++
 desktop/core/ext-py/avro-1.5.0/src/avro/ipc.py      |  510 ++++++++++++
 .../core/ext-py/avro-1.5.0/src/avro/protocol.py     |  222 +++++
 desktop/core/ext-py/avro-1.5.0/src/avro/schema.py   |  707 ++++++++++++++++
 desktop/core/ext-py/avro-1.5.0/src/avro/tool.py     |  160 ++++
 desktop/core/ext-py/avro-1.5.0/src/avro/txipc.py    |  222 +++++
 .../core/ext-py/avro-1.5.0/test/test_datafile.py    |  149 ++++
 .../avro-1.5.0/test/test_datafile_interop.py        |   39 +
 desktop/core/ext-py/avro-1.5.0/test/test_io.py      |  337 ++++++++
 desktop/core/ext-py/avro-1.5.0/test/test_ipc.py     |   31 +
 .../core/ext-py/avro-1.5.0/test/test_protocol.py    |  422 ++++++++++
 desktop/core/ext-py/avro-1.5.0/test/test_schema.py  |  394 +++++++++
 20 files changed, 4641 insertions(+), 36 deletions(-)
 create mode 100644 desktop/core/ext-py/avro-1.5.0/PKG-INFO
 create mode 100644 desktop/core/ext-py/avro-1.5.0/setup.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/__init__.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/datafile.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/io.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/ipc.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/protocol.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/schema.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/tool.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/src/avro/txipc.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/test/test_datafile.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/test/test_datafile_interop.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/test/test_io.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/test/test_ipc.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/test/test_protocol.py
 create mode 100644 desktop/core/ext-py/avro-1.5.0/test/test_schema.py

diff --git a/apps/filebrowser/src/filebrowser/static/css/fb.css b/apps/filebrowser/src/filebrowser/static/css/fb.css
index 13379d5..596b9a9 100644
--- a/apps/filebrowser/src/filebrowser/static/css/fb.css
+++ b/apps/filebrowser/src/filebrowser/static/css/fb.css
@@ -424,7 +424,7 @@ div.fileviewer .fv-viewLocation {
   background: url(/static/art/icons/folder_go.png) left 50%;
 }
 
-div.fileviewer .fv-viewGzip {
+div.fileviewer .fv-viewGzip, div.fileviewer .fv-viewAvro {
   background: url(/static/art/icons/page_white_zip.png) left 50%;
 }
 
@@ -541,4 +541,4 @@ div.fileeditor .fe-buttons {
 .fs-locationInput {
   width: 400px;
   margin-left: 5px;
-}
\ No newline at end of file
+}
diff --git a/apps/filebrowser/src/filebrowser/templates/display.mako b/apps/filebrowser/src/filebrowser/templates/display.mako
index 9c61078..b3b93af 100644
--- a/apps/filebrowser/src/filebrowser/templates/display.mako
+++ b/apps/filebrowser/src/filebrowser/templates/display.mako
@@ -43,6 +43,10 @@
       Preview As Gzip
     % endif
 
+    % if view['compression'] != "avro" and path.endswith('.avro'):
+      Preview As Avro
+    % endif
+
     % if view['compression'] and view['compression'] != "none":
       Stop preview
     % endif
@@ -56,7 +60,7 @@
   <div>
-    % if not view['compression'] or view['compression'] == "none":
+    % if not view['compression'] or view['compression'] in ("none", "avro"):
     <div>
       Viewing Bytes:
diff --git a/apps/filebrowser/src/filebrowser/views.py b/apps/filebrowser/src/filebrowser/views.py
index cac2ce1..84e44ca 100644
--- a/apps/filebrowser/src/filebrowser/views.py
+++ b/apps/filebrowser/src/filebrowser/views.py
@@ -36,6 +36,7 @@ from django.utils.http import http_date, urlquote
 from django.utils.html import escape
 from cStringIO import StringIO
 from gzip import GzipFile
+from avro import datafile, io
 
 from desktop.lib import i18n
 from desktop.lib.django_util import make_absolute, render_json
@@ -400,34 +401,12 @@ def display(request, path):
   if length > MAX_CHUNK_SIZE_BYTES:
     raise PopupException("Cannot request chunks greater than %d bytes" % MAX_CHUNK_SIZE_BYTES)
 
-  # Auto gzip detection, unless we are explicitly told to view binary
-  if not compression and mode != 'binary':
-    if path.endswith('.gz') and detect_gzip(request.fs.open(path).read(2)):
-      compression = 'gzip'
-      offset = 0
-    else:
-      compression = 'none'
-
-  f = request.fs.open(path)
-
-  if compression == 'gzip':
-    if offset and offset != 0:
-      raise PopupException("We don't support offset and gzip Compression")
-    try:
-      try:
-        contents = GzipFile('', 'r', 0, StringIO(f.read())).read(length)
-      except:
-        logging.warn("Could not decompress file at %s" % path, exc_info=True)
-        contents = ''
-        raise PopupException("Failed to decompress file")
-    finally:
-      f.close()
-  else:
-    try:
-      f.seek(offset)
-      contents = f.read(length)
-    finally:
-      f.close()
+  # Do not decompress in binary mode.
+  if mode == 'binary':
+    compression = 'none'
+
+  # Read out based on the detected codec.
+  compression, offset, length, contents = \
+      read_contents(compression, path, request.fs, offset, length)
 
   # Get contents as string for text mode, or at least try
   uni_contents = None
@@ -471,11 +450,103 @@ def display(request, path):
 
   return render_with_toolbars("display.mako", request, data)
 
+def read_contents(codec_type, path, fs, offset, length):
+  '''Reads the contents of the given path, decoding the data as appropriate.
+  Arguments:
+    codec_type - The type of codec to use to decode (auto-detected if None).
+    path - The path of the file to read.
+    fs - The FileSystem instance to use to read.
+    offset - Offset to seek to before the read begins.
+    length - Number of bytes to read after the offset.
+  Returns: A tuple of codec_type, offset, length and the contents read.
+  '''
+  # Auto codec detection for [gzip, avro, none];
+  # only done when codec_type is unset.
+  if not codec_type:
+    if path.endswith('.gz') and detect_gzip(fs.open(path).read(2)):
+      codec_type = 'gzip'
+      offset = 0
+    elif path.endswith('.avro') and detect_avro(fs.open(path).read(3)):
+      codec_type = 'avro'
+    else:
+      codec_type = 'none'
+
+  if codec_type == 'gzip':
+    contents = _read_gzip(fs, path, offset, length)
+  elif codec_type == 'avro':
+    contents = _read_avro(fs, path, offset, length)
+  else:
+    # For the 'none' type.
+    contents = _read_simple(fs, path, offset, length)
+
+  return (codec_type, offset, length, contents)
+
+def _read_avro(fs, path, offset, length):
+  contents = ''
+  try:
+    fhandle = fs.open(path)
+    try:
+      fhandle.seek(offset)
+      data_file_reader = datafile.DataFileReader(fhandle, io.DatumReader())
+      contents_list = []
+      read_start = fhandle.tell()
+      # Iterate over the entire sought file.
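+      # Note: fhandle.tell() reflects the position of the underlying stream,
+      # which the DataFileReader may advance a block at a time, so the
+      # read_length check below is approximate rather than byte-exact.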
+      for datum in data_file_reader:
+        read_length = fhandle.tell() - read_start
+        if read_length > length and len(contents_list) > 0:
+          break
+        else:
+          datum_str = str(datum) + "\n"
+          contents_list.append(datum_str)
+      data_file_reader.close()
+      contents = "".join(contents_list)
+    except:
+      logging.warn("Could not read avro file at %s" % path, exc_info=True)
+      raise PopupException("Failed to read Avro file")
+  finally:
+    fhandle.close()
+  return contents
+
+def _read_gzip(fs, path, offset, length):
+  contents = ''
+  if offset and offset != 0:
+    raise PopupException("We don't support offsets with gzip compression")
+  try:
+    fhandle = fs.open(path)
+    try:
+      contents = GzipFile('', 'r', 0, StringIO(fhandle.read())).read(length)
+    except:
+      logging.warn("Could not decompress file at %s" % path, exc_info=True)
+      raise PopupException("Failed to decompress file")
+  finally:
+    fhandle.close()
+  return contents
+
+def _read_simple(fs, path, offset, length):
+  contents = ''
+  try:
+    fhandle = fs.open(path)
+    try:
+      fhandle.seek(offset)
+      contents = fhandle.read(length)
+    except:
+      logging.warn("Could not read file at %s" % path, exc_info=True)
+      raise PopupException("Failed to read file")
+  finally:
+    fhandle.close()
+  return contents
+
 def detect_gzip(contents):
-  ''' This is a silly small function which checks to see if the file is Gzip'''
-  if contents[:2] == '\x1f\x8b':
-    return True
-  return False
+  '''This is a silly small function which checks to see if the file is Gzip'''
+  return contents[:2] == '\x1f\x8b'
+
+def detect_avro(contents):
+  '''This is a silly small function which checks to see if the file is Avro'''
+  # Check if the first three bytes are 'O', 'b' and 'j'.
+  return contents[:3] == '\x4F\x62\x6A'
 
 def _calculate_navigation(offset, length, size):
   """
@@ -730,4 +801,4 @@ def truncate(toTruncate, charsToKeep=50):
     truncated = toTruncate[:charsToKeep] + "..."
     return truncated
   else:
-    return toTruncate
\ No newline at end of file
+    return toTruncate
diff --git a/apps/filebrowser/src/filebrowser/views_test.py b/apps/filebrowser/src/filebrowser/views_test.py
index 1a28ef3..fabc2f9 100644
--- a/apps/filebrowser/src/filebrowser/views_test.py
+++ b/apps/filebrowser/src/filebrowser/views_test.py
@@ -20,6 +20,7 @@ Tests for filebrowser views
 """
 from nose.plugins.attrib import attr
 from hadoop import mini_cluster
+from avro import schema, datafile, io
 from desktop.lib.django_test_util import make_logged_in_client
 from nose.tools import assert_true, assert_false, assert_equal
 import logging
@@ -105,6 +106,65 @@ def test_listdir():
       pass      # Don't let cleanup errors mask earlier failures
     cluster.shutdown()
 
+@attr('requires_hadoop')
+def test_view_avro():
+  cluster = mini_cluster.shared_cluster(conf=True)
+  try:
+    c = make_logged_in_client()
+    cluster.fs.setuser(cluster.superuser)
+    if cluster.fs.isdir("/test-avro-filebrowser"):
+      cluster.fs.rmtree('/test-avro-filebrowser/')
+
+    cluster.fs.mkdir('/test-avro-filebrowser/')
+
+    test_schema = schema.parse("""
+      {
+        "name": "test",
+        "type": "record",
+        "fields": [
+          { "name": "name", "type": "string" },
+          { "name": "integer", "type": "int" }
+        ]
+      }
+    """)
+
+    f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
+    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
+                                               writers_schema=test_schema,
+                                               codec='deflate')
+    dummy_datum = {
+      'name': 'Test',
+      'integer': 10,
+    }
+    data_file_writer.append(dummy_datum)
+    data_file_writer.close()
+
+    # autodetect
+    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro')
+    assert_equal(response.context['view']['contents'],
+                 "{u'integer': 10, u'name': u'Test'}\n")
+
+    # offsetting should work as well
+    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
+    assert_true(response.context.has_key('view'))
+
+    f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
+    f.write("hello")
+    f.close()
+
+    # we shouldn't autodetect non-avro files
+    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro')
+    assert_equal(response.context['view']['contents'], "hello")
+
+    # explicitly specifying a compression that doesn't match the data should fail cleanly
+    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip')
+    assert_false(response.context.has_key('view'))
+
+  finally:
+    try:
+      cluster.fs.rmtree('/test-avro-filebrowser/')
+    except:
+      pass      # Don't let cleanup errors mask earlier failures
+    cluster.shutdown()
 
 @attr('requires_hadoop')
 def test_view_gz():
diff --git a/desktop/core/ext-py/avro-1.5.0/PKG-INFO b/desktop/core/ext-py/avro-1.5.0/PKG-INFO
new file mode 100644
index 0000000..6274d06
--- /dev/null
+++ b/desktop/core/ext-py/avro-1.5.0/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.0
+Name: avro
+Version: 1.5.0
+Summary: Avro is a serialization and RPC framework.
+Home-page: http://hadoop.apache.org/avro
+Author: Apache Avro
+Author-email: avro-dev@hadoop.apache.org
+License: Apache License 2.0
+Description: UNKNOWN
+Keywords: avro serialization rpc
+Platform: UNKNOWN
diff --git a/desktop/core/ext-py/avro-1.5.0/setup.py b/desktop/core/ext-py/avro-1.5.0/setup.py
new file mode 100644
index 0000000..27e5eaa
--- /dev/null
+++ b/desktop/core/ext-py/avro-1.5.0/setup.py
@@ -0,0 +1,40 @@
+#! /usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +try: + from setuptools import setup +except ImportError: + from distutils.core import setup + +setup( + name = 'avro', + version = '1.5.0', + packages = ['avro',], + package_dir = {'avro': 'src/avro'}, + + # Project uses simplejson, so ensure that it gets installed or upgraded + # on the target machine + install_requires = ['simplejson >= 2.0.9'], + + # metadata for upload to PyPI + author = 'Apache Avro', + author_email = 'avro-dev@hadoop.apache.org', + description = 'Avro is a serialization and RPC framework.', + license = 'Apache License 2.0', + keywords = 'avro serialization rpc', + url = 'http://hadoop.apache.org/avro', +) diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/__init__.py b/desktop/core/ext-py/avro-1.5.0/src/avro/__init__.py new file mode 100644 index 0000000..da51d9b --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['schema', 'io', 'datafile', 'protocol', 'ipc'] + diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/datafile.py b/desktop/core/ext-py/avro-1.5.0/src/avro/datafile.py new file mode 100644 index 0000000..f81c7e2 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/datafile.py @@ -0,0 +1,331 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Read/Write Avro File Object Containers. 
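
A minimal write/read sketch (the file name and record contents here are
illustrative only, not part of this module):

  from avro import datafile, io, schema

  SCHEMA = schema.parse(
      '{"type": "record", "name": "Rec",'
      ' "fields": [{"name": "x", "type": "int"}]}')

  # Write one record, then read it back.
  writer = datafile.DataFileWriter(open('rec.avro', 'wb'), io.DatumWriter(),
                                   writers_schema=SCHEMA)
  writer.append({'x': 1})
  writer.close()

  reader = datafile.DataFileReader(open('rec.avro', 'rb'), io.DatumReader())
  for record in reader:
    print record           # prints {u'x': 1}
  reader.close()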
+""" +import zlib +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +from avro import schema +from avro import io + +# +# Constants +# + +VERSION = 1 +MAGIC = 'Obj' + chr(VERSION) +MAGIC_SIZE = len(MAGIC) +SYNC_SIZE = 16 +SYNC_INTERVAL = 1000 * SYNC_SIZE # TODO(hammer): make configurable +META_SCHEMA = schema.parse("""\ +{"type": "record", "name": "org.apache.avro.file.Header", + "fields" : [ + {"name": "magic", "type": {"type": "fixed", "name": "magic", "size": %d}}, + {"name": "meta", "type": {"type": "map", "values": "bytes"}}, + {"name": "sync", "type": {"type": "fixed", "name": "sync", "size": %d}}]} +""" % (MAGIC_SIZE, SYNC_SIZE)) +VALID_CODECS = ['null', 'deflate'] +VALID_ENCODINGS = ['binary'] # not used yet + +CODEC_KEY = "avro.codec" +SCHEMA_KEY = "avro.schema" + +# +# Exceptions +# + +class DataFileException(schema.AvroException): + """ + Raised when there's a problem reading or writing file object containers. + """ + def __init__(self, fail_msg): + schema.AvroException.__init__(self, fail_msg) + +# +# Write Path +# + +class DataFileWriter(object): + @staticmethod + def generate_sync_marker(): + return generate_sixteen_random_bytes() + + # TODO(hammer): make 'encoder' a metadata property + def __init__(self, writer, datum_writer, writers_schema=None, codec='null'): + """ + If the schema is not present, presume we're appending. + + @param writer: File-like object to write into. + """ + self._writer = writer + self._encoder = io.BinaryEncoder(writer) + self._datum_writer = datum_writer + self._buffer_writer = StringIO() + self._buffer_encoder = io.BinaryEncoder(self._buffer_writer) + self._block_count = 0 + self._meta = {} + + if writers_schema is not None: + if codec not in VALID_CODECS: + raise DataFileException("Unknown codec: %r" % codec) + self._sync_marker = DataFileWriter.generate_sync_marker() + self.set_meta('avro.codec', codec) + self.set_meta('avro.schema', str(writers_schema)) + self.datum_writer.writers_schema = writers_schema + self._write_header() + else: + # open writer for reading to collect metadata + dfr = DataFileReader(writer, io.DatumReader()) + + # TODO(hammer): collect arbitrary metadata + # collect metadata + self._sync_marker = dfr.sync_marker + self.set_meta('avro.codec', dfr.get_meta('avro.codec')) + + # get schema used to write existing file + schema_from_file = dfr.get_meta('avro.schema') + self.set_meta('avro.schema', schema_from_file) + self.datum_writer.writers_schema = schema.parse(schema_from_file) + + # seek to the end of the file and prepare for writing + writer.seek(0, 2) + + # read-only properties + writer = property(lambda self: self._writer) + encoder = property(lambda self: self._encoder) + datum_writer = property(lambda self: self._datum_writer) + buffer_writer = property(lambda self: self._buffer_writer) + buffer_encoder = property(lambda self: self._buffer_encoder) + sync_marker = property(lambda self: self._sync_marker) + meta = property(lambda self: self._meta) + + # read/write properties + def set_block_count(self, new_val): + self._block_count = new_val + block_count = property(lambda self: self._block_count, set_block_count) + + # utility functions to read/write metadata entries + def get_meta(self, key): + return self._meta.get(key) + def set_meta(self, key, val): + self._meta[key] = val + + def _write_header(self): + header = {'magic': MAGIC, + 'meta': self.meta, + 'sync': self.sync_marker} + self.datum_writer.write_data(META_SCHEMA, header, self.encoder) + + # TODO(hammer): make a schema 
for blocks and use datum_writer + def _write_block(self): + if self.block_count > 0: + # write number of items in block + self.encoder.write_long(self.block_count) + + # write block contents + uncompressed_data = self.buffer_writer.getvalue() + if self.get_meta(CODEC_KEY) == 'null': + compressed_data = uncompressed_data + elif self.get_meta(CODEC_KEY) == 'deflate': + # The first two characters and last character are zlib + # wrappers around deflate data. + compressed_data = zlib.compress(uncompressed_data)[2:-1] + else: + fail_msg = '"%s" codec is not supported.' % self.get_meta(CODEC_KEY) + raise DataFileException(fail_msg) + + # Write length of block + self.encoder.write_long(len(compressed_data)) + + # Write block + self.writer.write(compressed_data) + + # write sync marker + self.writer.write(self.sync_marker) + + # reset buffer + self.buffer_writer.truncate(0) + self.block_count = 0 + + def append(self, datum): + """Append a datum to the file.""" + self.datum_writer.write(datum, self.buffer_encoder) + self.block_count += 1 + + # if the data to write is larger than the sync interval, write the block + if self.buffer_writer.tell() >= SYNC_INTERVAL: + self._write_block() + + def sync(self): + """ + Return the current position as a value that may be passed to + DataFileReader.seek(long). Forces the end of the current block, + emitting a synchronization marker. + """ + self._write_block() + return self.writer.tell() + + def flush(self): + """Flush the current state of the file, including metadata.""" + self._write_block() + self.writer.flush() + + def close(self): + """Close the file.""" + self.flush() + self.writer.close() + +class DataFileReader(object): + """Read files written by DataFileWriter.""" + # TODO(hammer): allow user to specify expected schema? + # TODO(hammer): allow user to specify the encoder + def __init__(self, reader, datum_reader): + self._reader = reader + self._raw_decoder = io.BinaryDecoder(reader) + self._datum_decoder = None # Maybe reset at every block. + self._datum_reader = datum_reader + + # read the header: magic, meta, sync + self._read_header() + + # ensure codec is valid + self.codec = self.get_meta('avro.codec') + if self.codec is None: + self.codec = "null" + if self.codec not in VALID_CODECS: + raise DataFileException('Unknown codec: %s.' % self.codec) + + # get file length + self._file_length = self.determine_file_length() + + # get ready to read + self._block_count = 0 + self.datum_reader.writers_schema = schema.parse(self.get_meta(SCHEMA_KEY)) + + def __iter__(self): + return self + + # read-only properties + reader = property(lambda self: self._reader) + raw_decoder = property(lambda self: self._raw_decoder) + datum_decoder = property(lambda self: self._datum_decoder) + datum_reader = property(lambda self: self._datum_reader) + sync_marker = property(lambda self: self._sync_marker) + meta = property(lambda self: self._meta) + file_length = property(lambda self: self._file_length) + + # read/write properties + def set_block_count(self, new_val): + self._block_count = new_val + block_count = property(lambda self: self._block_count, set_block_count) + + # utility functions to read/write metadata entries + def get_meta(self, key): + return self._meta.get(key) + def set_meta(self, key, val): + self._meta[key] = val + + def determine_file_length(self): + """ + Get file length and leave file cursor where we found it. 
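    In effect: remember tell(), seek(0, 2) to jump to EOF, take tell() as
    the length, then seek back to the remembered position.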
+ """ + remember_pos = self.reader.tell() + self.reader.seek(0, 2) + file_length = self.reader.tell() + self.reader.seek(remember_pos) + return file_length + + def is_EOF(self): + return self.reader.tell() == self.file_length + + def _read_header(self): + # seek to the beginning of the file to get magic block + self.reader.seek(0, 0) + + # read header into a dict + header = self.datum_reader.read_data( + META_SCHEMA, META_SCHEMA, self.raw_decoder) + + # check magic number + if header.get('magic') != MAGIC: + fail_msg = "Not an Avro data file: %s doesn't match %s."\ + % (header.get('magic'), MAGIC) + raise schema.AvroException(fail_msg) + + # set metadata + self._meta = header['meta'] + + # set sync marker + self._sync_marker = header['sync'] + + def _read_block_header(self): + self.block_count = self.raw_decoder.read_long() + if self.codec == "null": + # Skip a long; we don't need to use the length. + self.raw_decoder.skip_long() + self._datum_decoder = self._raw_decoder + else: + # Compressed data is stored as (length, data), which + # corresponds to how the "bytes" type is encoded. + data = self.raw_decoder.read_bytes() + # -15 is the log of the window size; negative indicates + # "raw" (no zlib headers) decompression. See zlib.h. + uncompressed = zlib.decompress(data, -15) + self._datum_decoder = io.BinaryDecoder(StringIO(uncompressed)) + + def _skip_sync(self): + """ + Read the length of the sync marker; if it matches the sync marker, + return True. Otherwise, seek back to where we started and return False. + """ + proposed_sync_marker = self.reader.read(SYNC_SIZE) + if proposed_sync_marker != self.sync_marker: + self.reader.seek(-SYNC_SIZE, 1) + return False + else: + return True + + # TODO(hammer): handle block of length zero + # TODO(hammer): clean this up with recursion + def next(self): + """Return the next datum in the file.""" + if self.block_count == 0: + if self.is_EOF(): + raise StopIteration + elif self._skip_sync(): + if self.is_EOF(): raise StopIteration + self._read_block_header() + else: + self._read_block_header() + + datum = self.datum_reader.read(self.datum_decoder) + self.block_count -= 1 + return datum + + def close(self): + """Close this reader.""" + self.reader.close() + +def generate_sixteen_random_bytes(): + try: + import os + return os.urandom(16) + except: + import random + return [ chr(random.randrange(256)) for i in range(16) ] diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/io.py b/desktop/core/ext-py/avro-1.5.0/src/avro/io.py new file mode 100644 index 0000000..b7f0f86 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/io.py @@ -0,0 +1,877 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Input/Output utilities, including: + + * i/o-specific constants + * i/o-specific exceptions + * schema validation + * leaf value encoding and decoding + * datum reader/writer stuff (?) + +Also includes a generic representation for data, which +uses the following mapping: + + * Schema records are implemented as dict. + * Schema arrays are implemented as list. + * Schema maps are implemented as dict. + * Schema strings are implemented as unicode. + * Schema bytes are implemented as str. + * Schema ints are implemented as int. + * Schema longs are implemented as long. + * Schema floats are implemented as float. + * Schema doubles are implemented as float. + * Schema booleans are implemented as bool. +""" +import struct +from avro import schema +import sys + +try: + import json +except ImportError: + import simplejson as json + +# +# Constants +# + +INT_MIN_VALUE = -(1 << 31) +INT_MAX_VALUE = (1 << 31) - 1 +LONG_MIN_VALUE = -(1 << 63) +LONG_MAX_VALUE = (1 << 63) - 1 + +# TODO(hammer): shouldn't ! be < for little-endian (according to spec?) +if sys.version_info >= (2, 5, 0): + struct_class = struct.Struct +else: + class SimpleStruct(object): + def __init__(self, format): + self.format = format + def pack(self, *args): + return struct.pack(self.format, *args) + def unpack(self, *args): + return struct.unpack(self.format, *args) + struct_class = SimpleStruct + +STRUCT_INT = struct_class('!I') # big-endian unsigned int +STRUCT_LONG = struct_class('!Q') # big-endian unsigned long long +STRUCT_FLOAT = struct_class('!f') # big-endian float +STRUCT_DOUBLE = struct_class('!d') # big-endian double + +# +# Exceptions +# + +class AvroTypeException(schema.AvroException): + """Raised when datum is not an example of schema.""" + def __init__(self, expected_schema, datum): + pretty_expected = json.dumps(json.loads(str(expected_schema)), indent=2) + fail_msg = "The datum %s is not an example of the schema %s"\ + % (datum, pretty_expected) + schema.AvroException.__init__(self, fail_msg) + +class SchemaResolutionException(schema.AvroException): + def __init__(self, fail_msg, writers_schema=None, readers_schema=None): + pretty_writers = json.dumps(json.loads(str(writers_schema)), indent=2) + pretty_readers = json.dumps(json.loads(str(readers_schema)), indent=2) + if writers_schema: fail_msg += "\nWriter's Schema: %s" % pretty_writers + if readers_schema: fail_msg += "\nReader's Schema: %s" % pretty_readers + schema.AvroException.__init__(self, fail_msg) + +# +# Validate +# + +def validate(expected_schema, datum): + """Determine if a python datum is an instance of a schema.""" + schema_type = expected_schema.type + if schema_type == 'null': + return datum is None + elif schema_type == 'boolean': + return isinstance(datum, bool) + elif schema_type == 'string': + return isinstance(datum, basestring) + elif schema_type == 'bytes': + return isinstance(datum, str) + elif schema_type == 'int': + return ((isinstance(datum, int) or isinstance(datum, long)) + and INT_MIN_VALUE <= datum <= INT_MAX_VALUE) + elif schema_type == 'long': + return ((isinstance(datum, int) or isinstance(datum, long)) + and LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE) + elif schema_type in ['float', 'double']: + return (isinstance(datum, int) or isinstance(datum, long) + or isinstance(datum, float)) + elif schema_type == 'fixed': + return isinstance(datum, str) and len(datum) == expected_schema.size + elif schema_type == 'enum': + return datum in expected_schema.symbols + elif schema_type == 'array': + return (isinstance(datum, list) and + 
False not in [validate(expected_schema.items, d) for d in datum]) + elif schema_type == 'map': + return (isinstance(datum, dict) and + False not in [isinstance(k, basestring) for k in datum.keys()] and + False not in + [validate(expected_schema.values, v) for v in datum.values()]) + elif schema_type in ['union', 'error_union']: + return True in [validate(s, datum) for s in expected_schema.schemas] + elif schema_type in ['record', 'error', 'request']: + return (isinstance(datum, dict) and + False not in + [validate(f.type, datum.get(f.name)) for f in expected_schema.fields]) + +# +# Decoder/Encoder +# + +class BinaryDecoder(object): + """Read leaf values.""" + def __init__(self, reader): + """ + reader is a Python object on which we can call read, seek, and tell. + """ + self._reader = reader + + # read-only properties + reader = property(lambda self: self._reader) + + def read(self, n): + """ + Read n bytes. + """ + return self.reader.read(n) + + def read_null(self): + """ + null is written as zero bytes + """ + return None + + def read_boolean(self): + """ + a boolean is written as a single byte + whose value is either 0 (false) or 1 (true). + """ + return ord(self.read(1)) == 1 + + def read_int(self): + """ + int and long values are written using variable-length, zig-zag coding. + """ + return self.read_long() + + def read_long(self): + """ + int and long values are written using variable-length, zig-zag coding. + """ + b = ord(self.read(1)) + n = b & 0x7F + shift = 7 + while (b & 0x80) != 0: + b = ord(self.read(1)) + n |= (b & 0x7F) << shift + shift += 7 + datum = (n >> 1) ^ -(n & 1) + return datum + + def read_float(self): + """ + A float is written as 4 bytes. + The float is converted into a 32-bit integer using a method equivalent to + Java's floatToIntBits and then encoded in little-endian format. + """ + bits = (((ord(self.read(1)) & 0xffL)) | + ((ord(self.read(1)) & 0xffL) << 8) | + ((ord(self.read(1)) & 0xffL) << 16) | + ((ord(self.read(1)) & 0xffL) << 24)) + return STRUCT_FLOAT.unpack(STRUCT_INT.pack(bits))[0] + + def read_double(self): + """ + A double is written as 8 bytes. + The double is converted into a 64-bit integer using a method equivalent to + Java's doubleToLongBits and then encoded in little-endian format. + """ + bits = (((ord(self.read(1)) & 0xffL)) | + ((ord(self.read(1)) & 0xffL) << 8) | + ((ord(self.read(1)) & 0xffL) << 16) | + ((ord(self.read(1)) & 0xffL) << 24) | + ((ord(self.read(1)) & 0xffL) << 32) | + ((ord(self.read(1)) & 0xffL) << 40) | + ((ord(self.read(1)) & 0xffL) << 48) | + ((ord(self.read(1)) & 0xffL) << 56)) + return STRUCT_DOUBLE.unpack(STRUCT_LONG.pack(bits))[0] + + def read_bytes(self): + """ + Bytes are encoded as a long followed by that many bytes of data. + """ + return self.read(self.read_long()) + + def read_utf8(self): + """ + A string is encoded as a long followed by + that many bytes of UTF-8 encoded character data. 
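    For example, the bytes 06 61 62 63 decode to u'abc': the zig-zag
    varint 06 gives a length of 3, followed by three bytes of UTF-8 data.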
+ """ + return unicode(self.read_bytes(), "utf-8") + + def skip_null(self): + pass + + def skip_boolean(self): + self.skip(1) + + def skip_int(self): + self.skip_long() + + def skip_long(self): + b = ord(self.read(1)) + while (b & 0x80) != 0: + b = ord(self.read(1)) + + def skip_float(self): + self.skip(4) + + def skip_double(self): + self.skip(8) + + def skip_bytes(self): + self.skip(self.read_long()) + + def skip_utf8(self): + self.skip_bytes() + + def skip(self, n): + self.reader.seek(self.reader.tell() + n) + +class BinaryEncoder(object): + """Write leaf values.""" + def __init__(self, writer): + """ + writer is a Python object on which we can call write. + """ + self._writer = writer + + # read-only properties + writer = property(lambda self: self._writer) + + def write(self, datum): + """Write an abritrary datum.""" + self.writer.write(datum) + + def write_null(self, datum): + """ + null is written as zero bytes + """ + pass + + def write_boolean(self, datum): + """ + a boolean is written as a single byte + whose value is either 0 (false) or 1 (true). + """ + if datum: + self.write(chr(1)) + else: + self.write(chr(0)) + + def write_int(self, datum): + """ + int and long values are written using variable-length, zig-zag coding. + """ + self.write_long(datum); + + def write_long(self, datum): + """ + int and long values are written using variable-length, zig-zag coding. + """ + datum = (datum << 1) ^ (datum >> 63) + while (datum & ~0x7F) != 0: + self.write(chr((datum & 0x7f) | 0x80)) + datum >>= 7 + self.write(chr(datum)) + + def write_float(self, datum): + """ + A float is written as 4 bytes. + The float is converted into a 32-bit integer using a method equivalent to + Java's floatToIntBits and then encoded in little-endian format. + """ + bits = STRUCT_INT.unpack(STRUCT_FLOAT.pack(datum))[0] + self.write(chr((bits) & 0xFF)) + self.write(chr((bits >> 8) & 0xFF)) + self.write(chr((bits >> 16) & 0xFF)) + self.write(chr((bits >> 24) & 0xFF)) + + def write_double(self, datum): + """ + A double is written as 8 bytes. + The double is converted into a 64-bit integer using a method equivalent to + Java's doubleToLongBits and then encoded in little-endian format. + """ + bits = STRUCT_LONG.unpack(STRUCT_DOUBLE.pack(datum))[0] + self.write(chr((bits) & 0xFF)) + self.write(chr((bits >> 8) & 0xFF)) + self.write(chr((bits >> 16) & 0xFF)) + self.write(chr((bits >> 24) & 0xFF)) + self.write(chr((bits >> 32) & 0xFF)) + self.write(chr((bits >> 40) & 0xFF)) + self.write(chr((bits >> 48) & 0xFF)) + self.write(chr((bits >> 56) & 0xFF)) + + def write_bytes(self, datum): + """ + Bytes are encoded as a long followed by that many bytes of data. + """ + self.write_long(len(datum)) + self.write(struct.pack('%ds' % len(datum), datum)) + + def write_utf8(self, datum): + """ + A string is encoded as a long followed by + that many bytes of UTF-8 encoded character data. 
+ """ + datum = datum.encode("utf-8") + self.write_bytes(datum) + +# +# DatumReader/Writer +# + +class DatumReader(object): + """Deserialize Avro-encoded data into a Python data structure.""" + @staticmethod + def check_props(schema_one, schema_two, prop_list): + for prop in prop_list: + if getattr(schema_one, prop) != getattr(schema_two, prop): + return False + return True + + @staticmethod + def match_schemas(writers_schema, readers_schema): + w_type = writers_schema.type + r_type = readers_schema.type + if 'union' in [w_type, r_type] or 'error_union' in [w_type, r_type]: + return True + elif (w_type in schema.PRIMITIVE_TYPES and r_type in schema.PRIMITIVE_TYPES + and w_type == r_type): + return True + elif (w_type == r_type == 'record' and + DatumReader.check_props(writers_schema, readers_schema, + ['fullname'])): + return True + elif (w_type == r_type == 'error' and + DatumReader.check_props(writers_schema, readers_schema, + ['fullname'])): + return True + elif (w_type == r_type == 'request'): + return True + elif (w_type == r_type == 'fixed' and + DatumReader.check_props(writers_schema, readers_schema, + ['fullname', 'size'])): + return True + elif (w_type == r_type == 'enum' and + DatumReader.check_props(writers_schema, readers_schema, + ['fullname'])): + return True + elif (w_type == r_type == 'map' and + DatumReader.check_props(writers_schema.values, + readers_schema.values, ['type'])): + return True + elif (w_type == r_type == 'array' and + DatumReader.check_props(writers_schema.items, + readers_schema.items, ['type'])): + return True + + # Handle schema promotion + if w_type == 'int' and r_type in ['long', 'float', 'double']: + return True + elif w_type == 'long' and r_type in ['float', 'double']: + return True + elif w_type == 'float' and r_type == 'double': + return True + return False + + def __init__(self, writers_schema=None, readers_schema=None): + """ + As defined in the Avro specification, we call the schema encoded + in the data the "writer's schema", and the schema expected by the + reader the "reader's schema". + """ + self._writers_schema = writers_schema + self._readers_schema = readers_schema + + # read/write properties + def set_writers_schema(self, writers_schema): + self._writers_schema = writers_schema + writers_schema = property(lambda self: self._writers_schema, + set_writers_schema) + def set_readers_schema(self, readers_schema): + self._readers_schema = readers_schema + readers_schema = property(lambda self: self._readers_schema, + set_readers_schema) + + def read(self, decoder): + if self.readers_schema is None: + self.readers_schema = self.writers_schema + return self.read_data(self.writers_schema, self.readers_schema, decoder) + + def read_data(self, writers_schema, readers_schema, decoder): + # schema matching + if not DatumReader.match_schemas(writers_schema, readers_schema): + fail_msg = 'Schemas do not match.' + raise SchemaResolutionException(fail_msg, writers_schema, readers_schema) + + # schema resolution: reader's schema is a union, writer's schema is not + if (writers_schema.type not in ['union', 'error_union'] + and readers_schema.type in ['union', 'error_union']): + for s in readers_schema.schemas: + if DatumReader.match_schemas(writers_schema, s): + return self.read_data(writers_schema, s, decoder) + fail_msg = 'Schemas do not match.' 
+ raise SchemaResolutionException(fail_msg, writers_schema, readers_schema) + + # function dispatch for reading data based on type of writer's schema + if writers_schema.type == 'null': + return decoder.read_null() + elif writers_schema.type == 'boolean': + return decoder.read_boolean() + elif writers_schema.type == 'string': + return decoder.read_utf8() + elif writers_schema.type == 'int': + return decoder.read_int() + elif writers_schema.type == 'long': + return decoder.read_long() + elif writers_schema.type == 'float': + return decoder.read_float() + elif writers_schema.type == 'double': + return decoder.read_double() + elif writers_schema.type == 'bytes': + return decoder.read_bytes() + elif writers_schema.type == 'fixed': + return self.read_fixed(writers_schema, readers_schema, decoder) + elif writers_schema.type == 'enum': + return self.read_enum(writers_schema, readers_schema, decoder) + elif writers_schema.type == 'array': + return self.read_array(writers_schema, readers_schema, decoder) + elif writers_schema.type == 'map': + return self.read_map(writers_schema, readers_schema, decoder) + elif writers_schema.type in ['union', 'error_union']: + return self.read_union(writers_schema, readers_schema, decoder) + elif writers_schema.type in ['record', 'error', 'request']: + return self.read_record(writers_schema, readers_schema, decoder) + else: + fail_msg = "Cannot read unknown schema type: %s" % writers_schema.type + raise schema.AvroException(fail_msg) + + def skip_data(self, writers_schema, decoder): + if writers_schema.type == 'null': + return decoder.skip_null() + elif writers_schema.type == 'boolean': + return decoder.skip_boolean() + elif writers_schema.type == 'string': + return decoder.skip_utf8() + elif writers_schema.type == 'int': + return decoder.skip_int() + elif writers_schema.type == 'long': + return decoder.skip_long() + elif writers_schema.type == 'float': + return decoder.skip_float() + elif writers_schema.type == 'double': + return decoder.skip_double() + elif writers_schema.type == 'bytes': + return decoder.skip_bytes() + elif writers_schema.type == 'fixed': + return self.skip_fixed(writers_schema, decoder) + elif writers_schema.type == 'enum': + return self.skip_enum(writers_schema, decoder) + elif writers_schema.type == 'array': + return self.skip_array(writers_schema, decoder) + elif writers_schema.type == 'map': + return self.skip_map(writers_schema, decoder) + elif writers_schema.type in ['union', 'error_union']: + return self.skip_union(writers_schema, decoder) + elif writers_schema.type in ['record', 'error', 'request']: + return self.skip_record(writers_schema, decoder) + else: + fail_msg = "Unknown schema type: %s" % writers_schema.type + raise schema.AvroException(fail_msg) + + def read_fixed(self, writers_schema, readers_schema, decoder): + """ + Fixed instances are encoded using the number of bytes declared + in the schema. + """ + return decoder.read(writers_schema.size) + + def skip_fixed(self, writers_schema, decoder): + return decoder.skip(writers_schema.size) + + def read_enum(self, writers_schema, readers_schema, decoder): + """ + An enum is encoded by a int, representing the zero-based position + of the symbol in the schema. 
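    For example, for an enum with symbols ["HEARTS", "SPADES"] (illustrative),
    the stored int 1 (zig-zag byte 02) decodes to "SPADES".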
+ """ + # read data + index_of_symbol = decoder.read_int() + if index_of_symbol >= len(writers_schema.symbols): + fail_msg = "Can't access enum index %d for enum with %d symbols"\ + % (index_of_symbol, len(writers_schema.symbols)) + raise SchemaResolutionException(fail_msg, writers_schema, readers_schema) + read_symbol = writers_schema.symbols[index_of_symbol] + + # schema resolution + if read_symbol not in readers_schema.symbols: + fail_msg = "Symbol %s not present in Reader's Schema" % read_symbol + raise SchemaResolutionException(fail_msg, writers_schema, readers_schema) + + return read_symbol + + def skip_enum(self, writers_schema, decoder): + return decoder.skip_int() + + def read_array(self, writers_schema, readers_schema, decoder): + """ + Arrays are encoded as a series of blocks. + + Each block consists of a long count value, + followed by that many array items. + A block with count zero indicates the end of the array. + Each item is encoded per the array's item schema. + + If a block's count is negative, + then the count is followed immediately by a long block size, + indicating the number of bytes in the block. + The actual count in this case + is the absolute value of the count written. + """ + read_items = [] + block_count = decoder.read_long() + while block_count != 0: + if block_count < 0: + block_count = -block_count + block_size = decoder.read_long() + for i in range(block_count): + read_items.append(self.read_data(writers_schema.items, + readers_schema.items, decoder)) + block_count = decoder.read_long() + return read_items + + def skip_array(self, writers_schema, decoder): + block_count = decoder.read_long() + while block_count != 0: + if block_count < 0: + block_size = decoder.read_long() + decoder.skip(block_size) + else: + for i in range(block_count): + self.skip_data(writers_schema.items, decoder) + block_count = decoder.read_long() + + def read_map(self, writers_schema, readers_schema, decoder): + """ + Maps are encoded as a series of blocks. + + Each block consists of a long count value, + followed by that many key/value pairs. + A block with count zero indicates the end of the map. + Each item is encoded per the map's value schema. + + If a block's count is negative, + then the count is followed immediately by a long block size, + indicating the number of bytes in the block. + The actual count in this case + is the absolute value of the count written. + """ + read_items = {} + block_count = decoder.read_long() + while block_count != 0: + if block_count < 0: + block_count = -block_count + block_size = decoder.read_long() + for i in range(block_count): + key = decoder.read_utf8() + read_items[key] = self.read_data(writers_schema.values, + readers_schema.values, decoder) + block_count = decoder.read_long() + return read_items + + def skip_map(self, writers_schema, decoder): + block_count = decoder.read_long() + while block_count != 0: + if block_count < 0: + block_size = decoder.read_long() + decoder.skip(block_size) + else: + for i in range(block_count): + decoder.skip_utf8() + self.skip_data(writers_schema.values, decoder) + block_count = decoder.read_long() + + def read_union(self, writers_schema, readers_schema, decoder): + """ + A union is encoded by first writing a long value indicating + the zero-based position within the union of the schema of its value. + The value is then encoded per the indicated schema within the union. 
+ """ + # schema resolution + index_of_schema = int(decoder.read_long()) + if index_of_schema >= len(writers_schema.schemas): + fail_msg = "Can't access branch index %d for union with %d branches"\ + % (index_of_schema, len(writers_schema.schemas)) + raise SchemaResolutionException(fail_msg, writers_schema, readers_schema) + selected_writers_schema = writers_schema.schemas[index_of_schema] + + # read data + return self.read_data(selected_writers_schema, readers_schema, decoder) + + def skip_union(self, writers_schema, decoder): + index_of_schema = int(decoder.read_long()) + if index_of_schema >= len(writers_schema.schemas): + fail_msg = "Can't access branch index %d for union with %d branches"\ + % (index_of_schema, len(writers_schema.schemas)) + raise SchemaResolutionException(fail_msg, writers_schema) + return self.skip_data(writers_schema.schemas[index_of_schema], decoder) + + def read_record(self, writers_schema, readers_schema, decoder): + """ + A record is encoded by encoding the values of its fields + in the order that they are declared. In other words, a record + is encoded as just the concatenation of the encodings of its fields. + Field values are encoded per their schema. + + Schema Resolution: + * the ordering of fields may be different: fields are matched by name. + * schemas for fields with the same name in both records are resolved + recursively. + * if the writer's record contains a field with a name not present in the + reader's record, the writer's value for that field is ignored. + * if the reader's record schema has a field that contains a default value, + and writer's schema does not have a field with the same name, then the + reader should use the default value from its field. + * if the reader's record schema has a field with no default value, and + writer's schema does not have a field with the same name, then the + field's value is unset. + """ + # schema resolution + readers_fields_dict = readers_schema.fields_dict + read_record = {} + for field in writers_schema.fields: + readers_field = readers_fields_dict.get(field.name) + if readers_field is not None: + field_val = self.read_data(field.type, readers_field.type, decoder) + read_record[field.name] = field_val + else: + self.skip_data(field.type, decoder) + + # fill in default values + if len(readers_fields_dict) > len(read_record): + writers_fields_dict = writers_schema.fields_dict + for field_name, field in readers_fields_dict.items(): + if not writers_fields_dict.has_key(field_name): + if field.has_default: + field_val = self._read_default_value(field.type, field.default) + read_record[field.name] = field_val + else: + fail_msg = 'No default value for field %s' % field_name + raise SchemaResolutionException(fail_msg, writers_schema, + readers_schema) + return read_record + + def skip_record(self, writers_schema, decoder): + for field in writers_schema.fields: + self.skip_data(field.type, decoder) + + def _read_default_value(self, field_schema, default_value): + """ + Basically a JSON Decoder? 
+ """ + if field_schema.type == 'null': + return None + elif field_schema.type == 'boolean': + return bool(default_value) + elif field_schema.type == 'int': + return int(default_value) + elif field_schema.type == 'long': + return long(default_value) + elif field_schema.type in ['float', 'double']: + return float(default_value) + elif field_schema.type in ['enum', 'fixed', 'string', 'bytes']: + return default_value + elif field_schema.type == 'array': + read_array = [] + for json_val in default_value: + item_val = self._read_default_value(field_schema.items, json_val) + read_array.append(item_val) + return read_array + elif field_schema.type == 'map': + read_map = {} + for key, json_val in default_value.items(): + map_val = self._read_default_value(field_schema.values, json_val) + read_map[key] = map_val + return read_map + elif field_schema.type in ['union', 'error_union']: + return self._read_default_value(field_schema.schemas[0], default_value) + elif field_schema.type == 'record': + read_record = {} + for field in field_schema.fields: + json_val = default_value.get(field.name) + if json_val is None: json_val = field.default + field_val = self._read_default_value(field.type, json_val) + read_record[field.name] = field_val + return read_record + else: + fail_msg = 'Unknown type: %s' % field_schema.type + raise schema.AvroException(fail_msg) + +class DatumWriter(object): + """DatumWriter for generic python objects.""" + def __init__(self, writers_schema=None): + self._writers_schema = writers_schema + + # read/write properties + def set_writers_schema(self, writers_schema): + self._writers_schema = writers_schema + writers_schema = property(lambda self: self._writers_schema, + set_writers_schema) + + def write(self, datum, encoder): + # validate datum + if not validate(self.writers_schema, datum): + raise AvroTypeException(self.writers_schema, datum) + + self.write_data(self.writers_schema, datum, encoder) + + def write_data(self, writers_schema, datum, encoder): + # function dispatch to write datum + if writers_schema.type == 'null': + encoder.write_null(datum) + elif writers_schema.type == 'boolean': + encoder.write_boolean(datum) + elif writers_schema.type == 'string': + encoder.write_utf8(datum) + elif writers_schema.type == 'int': + encoder.write_int(datum) + elif writers_schema.type == 'long': + encoder.write_long(datum) + elif writers_schema.type == 'float': + encoder.write_float(datum) + elif writers_schema.type == 'double': + encoder.write_double(datum) + elif writers_schema.type == 'bytes': + encoder.write_bytes(datum) + elif writers_schema.type == 'fixed': + self.write_fixed(writers_schema, datum, encoder) + elif writers_schema.type == 'enum': + self.write_enum(writers_schema, datum, encoder) + elif writers_schema.type == 'array': + self.write_array(writers_schema, datum, encoder) + elif writers_schema.type == 'map': + self.write_map(writers_schema, datum, encoder) + elif writers_schema.type in ['union', 'error_union']: + self.write_union(writers_schema, datum, encoder) + elif writers_schema.type in ['record', 'error', 'request']: + self.write_record(writers_schema, datum, encoder) + else: + fail_msg = 'Unknown type: %s' % writers_schema.type + raise schema.AvroException(fail_msg) + + def write_fixed(self, writers_schema, datum, encoder): + """ + Fixed instances are encoded using the number of bytes declared + in the schema. 
+ """ + encoder.write(datum) + + def write_enum(self, writers_schema, datum, encoder): + """ + An enum is encoded by a int, representing the zero-based position + of the symbol in the schema. + """ + index_of_datum = writers_schema.symbols.index(datum) + encoder.write_int(index_of_datum) + + def write_array(self, writers_schema, datum, encoder): + """ + Arrays are encoded as a series of blocks. + + Each block consists of a long count value, + followed by that many array items. + A block with count zero indicates the end of the array. + Each item is encoded per the array's item schema. + + If a block's count is negative, + then the count is followed immediately by a long block size, + indicating the number of bytes in the block. + The actual count in this case + is the absolute value of the count written. + """ + if len(datum) > 0: + encoder.write_long(len(datum)) + for item in datum: + self.write_data(writers_schema.items, item, encoder) + encoder.write_long(0) + + def write_map(self, writers_schema, datum, encoder): + """ + Maps are encoded as a series of blocks. + + Each block consists of a long count value, + followed by that many key/value pairs. + A block with count zero indicates the end of the map. + Each item is encoded per the map's value schema. + + If a block's count is negative, + then the count is followed immediately by a long block size, + indicating the number of bytes in the block. + The actual count in this case + is the absolute value of the count written. + """ + if len(datum) > 0: + encoder.write_long(len(datum)) + for key, val in datum.items(): + encoder.write_utf8(key) + self.write_data(writers_schema.values, val, encoder) + encoder.write_long(0) + + def write_union(self, writers_schema, datum, encoder): + """ + A union is encoded by first writing a long value indicating + the zero-based position within the union of the schema of its value. + The value is then encoded per the indicated schema within the union. + """ + # resolve union + index_of_schema = -1 + for i, candidate_schema in enumerate(writers_schema.schemas): + if validate(candidate_schema, datum): + index_of_schema = i + if index_of_schema < 0: raise AvroTypeException(writers_schema, datum) + + # write data + encoder.write_long(index_of_schema) + self.write_data(writers_schema.schemas[index_of_schema], datum, encoder) + + def write_record(self, writers_schema, datum, encoder): + """ + A record is encoded by encoding the values of its fields + in the order that they are declared. In other words, a record + is encoded as just the concatenation of the encodings of its fields. + Field values are encoded per their schema. + """ + for field in writers_schema.fields: + self.write_data(field.type, datum.get(field.name), encoder) diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/ipc.py b/desktop/core/ext-py/avro-1.5.0/src/avro/ipc.py new file mode 100644 index 0000000..321887d --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/ipc.py @@ -0,0 +1,510 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Support for inter-process calls. +""" +import httplib +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +from avro import io +from avro import protocol +from avro import schema + +# +# Constants +# + +# Handshake schema is pulled in during build +HANDSHAKE_REQUEST_SCHEMA = schema.parse(""" +{ + "type": "record", + "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc", + "fields": [ + {"name": "clientHash", + "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "clientProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": "MD5"}, + {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]} + ] +} + +""") + +HANDSHAKE_RESPONSE_SCHEMA = schema.parse(""" +{ + "type": "record", + "name": "HandshakeResponse", "namespace": "org.apache.avro.ipc", + "fields": [ + {"name": "match", + "type": {"type": "enum", "name": "HandshakeMatch", + "symbols": ["BOTH", "CLIENT", "NONE"]}}, + {"name": "serverProtocol", + "type": ["null", "string"]}, + {"name": "serverHash", + "type": ["null", {"type": "fixed", "name": "MD5", "size": 16}]}, + {"name": "meta", + "type": ["null", {"type": "map", "values": "bytes"}]} + ] +} + +""") + +HANDSHAKE_REQUESTOR_WRITER = io.DatumWriter(HANDSHAKE_REQUEST_SCHEMA) +HANDSHAKE_REQUESTOR_READER = io.DatumReader(HANDSHAKE_RESPONSE_SCHEMA) +HANDSHAKE_RESPONDER_WRITER = io.DatumWriter(HANDSHAKE_RESPONSE_SCHEMA) +HANDSHAKE_RESPONDER_READER = io.DatumReader(HANDSHAKE_REQUEST_SCHEMA) + +META_SCHEMA = schema.parse('{"type": "map", "values": "bytes"}') +META_WRITER = io.DatumWriter(META_SCHEMA) +META_READER = io.DatumReader(META_SCHEMA) + +SYSTEM_ERROR_SCHEMA = schema.parse('["string"]') + +# protocol cache +REMOTE_HASHES = {} +REMOTE_PROTOCOLS = {} + +BIG_ENDIAN_INT_STRUCT = io.struct_class('!I') +BUFFER_HEADER_LENGTH = 4 +BUFFER_SIZE = 8192 + +# +# Exceptions +# + +class AvroRemoteException(schema.AvroException): + """ + Raised when an error message is sent by an Avro requestor or responder. 
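  The message carried is the error datum itself, decoded against the
  message's declared error union (or the system '["string"]' error schema).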
+ """ + def __init__(self, fail_msg=None): + schema.AvroException.__init__(self, fail_msg) + +class ConnectionClosedException(schema.AvroException): + pass + +# +# Base IPC Classes (Requestor/Responder) +# + +class BaseRequestor(object): + """Base class for the client side of a protocol interaction.""" + def __init__(self, local_protocol, transceiver): + self._local_protocol = local_protocol + self._transceiver = transceiver + self._remote_protocol = None + self._remote_hash = None + self._send_protocol = None + + # read-only properties + local_protocol = property(lambda self: self._local_protocol) + transceiver = property(lambda self: self._transceiver) + + # read/write properties + def set_remote_protocol(self, new_remote_protocol): + self._remote_protocol = new_remote_protocol + REMOTE_PROTOCOLS[self.transceiver.remote_name] = self.remote_protocol + remote_protocol = property(lambda self: self._remote_protocol, + set_remote_protocol) + + def set_remote_hash(self, new_remote_hash): + self._remote_hash = new_remote_hash + REMOTE_HASHES[self.transceiver.remote_name] = self.remote_hash + remote_hash = property(lambda self: self._remote_hash, set_remote_hash) + + def set_send_protocol(self, new_send_protocol): + self._send_protocol = new_send_protocol + send_protocol = property(lambda self: self._send_protocol, set_send_protocol) + + def request(self, message_name, request_datum): + """ + Writes a request message and reads a response or error message. + """ + # build handshake and call request + buffer_writer = StringIO() + buffer_encoder = io.BinaryEncoder(buffer_writer) + self.write_handshake_request(buffer_encoder) + self.write_call_request(message_name, request_datum, buffer_encoder) + + # send the handshake and call request; block until call response + call_request = buffer_writer.getvalue() + return self.issue_request(call_request, message_name, request_datum) + + def write_handshake_request(self, encoder): + local_hash = self.local_protocol.md5 + remote_name = self.transceiver.remote_name + remote_hash = REMOTE_HASHES.get(remote_name) + if remote_hash is None: + remote_hash = local_hash + self.remote_protocol = self.local_protocol + request_datum = {} + request_datum['clientHash'] = local_hash + request_datum['serverHash'] = remote_hash + if self.send_protocol: + request_datum['clientProtocol'] = str(self.local_protocol) + HANDSHAKE_REQUESTOR_WRITER.write(request_datum, encoder) + + def write_call_request(self, message_name, request_datum, encoder): + """ + The format of a call request is: + * request metadata, a map with values of type bytes + * the message name, an Avro string, followed by + * the message parameters. Parameters are serialized according to + the message's request declaration. 
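+
+    As an illustrative sketch (the message name 'echo' and its single
+    string parameter are assumed, not part of the upstream protocol),
+    one call could be serialized like this:
+
+      buffer_writer = StringIO()
+      buffer_encoder = io.BinaryEncoder(buffer_writer)
+      self.write_call_request('echo', {'message': 'hello'}, buffer_encoder)
+      # buffer_writer now holds an empty metadata map (a zero count),
+      # the utf-8 string 'echo', then the encoded request parameters.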
+ """ + # request metadata (not yet implemented) + request_metadata = {} + META_WRITER.write(request_metadata, encoder) + + # message name + message = self.local_protocol.messages.get(message_name) + if message is None: + raise schema.AvroException('Unknown message: %s' % message_name) + encoder.write_utf8(message.name) + + # message parameters + self.write_request(message.request, request_datum, encoder) + + def write_request(self, request_schema, request_datum, encoder): + datum_writer = io.DatumWriter(request_schema) + datum_writer.write(request_datum, encoder) + + def read_handshake_response(self, decoder): + handshake_response = HANDSHAKE_REQUESTOR_READER.read(decoder) + match = handshake_response.get('match') + if match == 'BOTH': + self.send_protocol = False + return True + elif match == 'CLIENT': + if self.send_protocol: + raise schema.AvroException('Handshake failure.') + self.remote_protocol = protocol.parse( + handshake_response.get('serverProtocol')) + self.remote_hash = handshake_response.get('serverHash') + self.send_protocol = False + return True + elif match == 'NONE': + if self.send_protocol: + raise schema.AvroException('Handshake failure.') + self.remote_protocol = protocol.parse( + handshake_response.get('serverProtocol')) + self.remote_hash = handshake_response.get('serverHash') + self.send_protocol = True + return False + else: + raise schema.AvroException('Unexpected match: %s' % match) + + def read_call_response(self, message_name, decoder): + """ + The format of a call response is: + * response metadata, a map with values of type bytes + * a one-byte error flag boolean, followed by either: + o if the error flag is false, + the message response, serialized per the message's response schema. + o if the error flag is true, + the error, serialized per the message's error union schema. 
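+
+    A sketch of the client side (the message name 'echo' and the bytes
+    in response_bytes are assumed): read_call_response either returns
+    the decoded response or raises the decoded remote error.
+
+      decoder = io.BinaryDecoder(StringIO(response_bytes))
+      try:
+        result = self.read_call_response('echo', decoder)
+      except AvroRemoteException, e:
+        print e  # the error flag was true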
+ """ + # response metadata + response_metadata = META_READER.read(decoder) + + # remote response schema + remote_message_schema = self.remote_protocol.messages.get(message_name) + if remote_message_schema is None: + raise schema.AvroException('Unknown remote message: %s' % message_name) + + # local response schema + local_message_schema = self.local_protocol.messages.get(message_name) + if local_message_schema is None: + raise schema.AvroException('Unknown local message: %s' % message_name) + + # error flag + if not decoder.read_boolean(): + writers_schema = remote_message_schema.response + readers_schema = local_message_schema.response + return self.read_response(writers_schema, readers_schema, decoder) + else: + writers_schema = remote_message_schema.errors + readers_schema = local_message_schema.errors + raise self.read_error(writers_schema, readers_schema, decoder) + + def read_response(self, writers_schema, readers_schema, decoder): + datum_reader = io.DatumReader(writers_schema, readers_schema) + result = datum_reader.read(decoder) + return result + + def read_error(self, writers_schema, readers_schema, decoder): + datum_reader = io.DatumReader(writers_schema, readers_schema) + return AvroRemoteException(datum_reader.read(decoder)) + +class Requestor(BaseRequestor): + + def issue_request(self, call_request, message_name, request_datum): + call_response = self.transceiver.transceive(call_request) + + # process the handshake and call response + buffer_decoder = io.BinaryDecoder(StringIO(call_response)) + call_response_exists = self.read_handshake_response(buffer_decoder) + if call_response_exists: + return self.read_call_response(message_name, buffer_decoder) + else: + return self.request(message_name, request_datum) + +class Responder(object): + """Base class for the server side of a protocol interaction.""" + def __init__(self, local_protocol): + self._local_protocol = local_protocol + self._local_hash = self.local_protocol.md5 + self._protocol_cache = {} + self.set_protocol_cache(self.local_hash, self.local_protocol) + + # read-only properties + local_protocol = property(lambda self: self._local_protocol) + local_hash = property(lambda self: self._local_hash) + protocol_cache = property(lambda self: self._protocol_cache) + + # utility functions to manipulate protocol cache + def get_protocol_cache(self, hash): + return self.protocol_cache.get(hash) + def set_protocol_cache(self, hash, protocol): + self.protocol_cache[hash] = protocol + + def respond(self, call_request): + """ + Called by a server to deserialize a request, compute and serialize + a response or error. Compare to 'handle()' in Thrift. 
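+
+    A transport-level sketch (the socket-like file object 'connection'
+    is assumed), using the framing helpers defined later in this module:
+
+      reader = FramedReader(connection)
+      call_request = reader.read_framed_message()
+      call_response = responder.respond(call_request)
+      FramedWriter(connection).write_framed_message(call_response)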
+    """
+    buffer_reader = StringIO(call_request)
+    buffer_decoder = io.BinaryDecoder(buffer_reader)
+    buffer_writer = StringIO()
+    buffer_encoder = io.BinaryEncoder(buffer_writer)
+    error = None
+    response_metadata = {}
+
+    try:
+      remote_protocol = self.process_handshake(buffer_decoder, buffer_encoder)
+      # handshake failure
+      if remote_protocol is None:
+        return buffer_writer.getvalue()
+
+      # read request using remote protocol
+      request_metadata = META_READER.read(buffer_decoder)
+      remote_message_name = buffer_decoder.read_utf8()
+
+      # get remote and local request schemas so we can do
+      # schema resolution (one fine day)
+      remote_message = remote_protocol.messages.get(remote_message_name)
+      if remote_message is None:
+        fail_msg = 'Unknown remote message: %s' % remote_message_name
+        raise schema.AvroException(fail_msg)
+      local_message = self.local_protocol.messages.get(remote_message_name)
+      if local_message is None:
+        fail_msg = 'Unknown local message: %s' % remote_message_name
+        raise schema.AvroException(fail_msg)
+      writers_schema = remote_message.request
+      readers_schema = local_message.request
+      request = self.read_request(writers_schema, readers_schema,
+                                  buffer_decoder)
+
+      # perform server logic
+      try:
+        response = self.invoke(local_message, request)
+      except AvroRemoteException, e:
+        error = e
+      except Exception, e:
+        error = AvroRemoteException(str(e))
+
+      # write response using local protocol
+      META_WRITER.write(response_metadata, buffer_encoder)
+      buffer_encoder.write_boolean(error is not None)
+      if error is None:
+        writers_schema = local_message.response
+        self.write_response(writers_schema, response, buffer_encoder)
+      else:
+        writers_schema = local_message.errors
+        self.write_error(writers_schema, error, buffer_encoder)
+    except schema.AvroException, e:
+      error = AvroRemoteException(str(e))
+      buffer_encoder = io.BinaryEncoder(StringIO())
+      META_WRITER.write(response_metadata, buffer_encoder)
+      buffer_encoder.write_boolean(True)
+      self.write_error(SYSTEM_ERROR_SCHEMA, error, buffer_encoder)
+    return buffer_writer.getvalue()
+
+  def process_handshake(self, decoder, encoder):
+    handshake_request = HANDSHAKE_RESPONDER_READER.read(decoder)
+    handshake_response = {}
+
+    # determine the remote protocol
+    client_hash = handshake_request.get('clientHash')
+    client_protocol = handshake_request.get('clientProtocol')
+    remote_protocol = self.get_protocol_cache(client_hash)
+    if remote_protocol is None and client_protocol is not None:
+      remote_protocol = protocol.parse(client_protocol)
+      self.set_protocol_cache(client_hash, remote_protocol)
+
+    # evaluate remote's guess of the local protocol
+    server_hash = handshake_request.get('serverHash')
+    if self.local_hash == server_hash:
+      if remote_protocol is None:
+        handshake_response['match'] = 'NONE'
+      else:
+        handshake_response['match'] = 'BOTH'
+    else:
+      if remote_protocol is None:
+        handshake_response['match'] = 'NONE'
+      else:
+        handshake_response['match'] = 'CLIENT'
+
+    if handshake_response['match'] != 'BOTH':
+      handshake_response['serverProtocol'] = str(self.local_protocol)
+      handshake_response['serverHash'] = self.local_hash
+
+    HANDSHAKE_RESPONDER_WRITER.write(handshake_response, encoder)
+    return remote_protocol
+
+  def invoke(self, local_message, request):
+    """
+    Actual work done by server: cf. handler in thrift.
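+
+    Subclasses override this. A minimal sketch (the 'echo' message and
+    its semantics are assumed, not part of the library):
+
+      class EchoResponder(Responder):
+        def invoke(self, local_message, request):
+          if local_message.name == 'echo':
+            return request['message']
+          raise AvroRemoteException('unknown message: %s' % local_message.name)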
+ """ + pass + + def read_request(self, writers_schema, readers_schema, decoder): + datum_reader = io.DatumReader(writers_schema, readers_schema) + return datum_reader.read(decoder) + + def write_response(self, writers_schema, response_datum, encoder): + datum_writer = io.DatumWriter(writers_schema) + datum_writer.write(response_datum, encoder) + + def write_error(self, writers_schema, error_exception, encoder): + datum_writer = io.DatumWriter(writers_schema) + datum_writer.write(str(error_exception), encoder) + +# +# Utility classes +# + +class FramedReader(object): + """Wrapper around a file-like object to read framed data.""" + def __init__(self, reader): + self._reader = reader + + # read-only properties + reader = property(lambda self: self._reader) + + def read_framed_message(self): + message = [] + while True: + buffer = StringIO() + buffer_length = self._read_buffer_length() + if buffer_length == 0: + return ''.join(message) + while buffer.tell() < buffer_length: + chunk = self.reader.read(buffer_length - buffer.tell()) + if chunk == '': + raise ConnectionClosedException("Reader read 0 bytes.") + buffer.write(chunk) + message.append(buffer.getvalue()) + + def _read_buffer_length(self): + read = self.reader.read(BUFFER_HEADER_LENGTH) + if read == '': + raise ConnectionClosedException("Reader read 0 bytes.") + return BIG_ENDIAN_INT_STRUCT.unpack(read)[0] + +class FramedWriter(object): + """Wrapper around a file-like object to write framed data.""" + def __init__(self, writer): + self._writer = writer + + # read-only properties + writer = property(lambda self: self._writer) + + def write_framed_message(self, message): + message_length = len(message) + total_bytes_sent = 0 + while message_length - total_bytes_sent > 0: + if message_length - total_bytes_sent > BUFFER_SIZE: + buffer_length = BUFFER_SIZE + else: + buffer_length = message_length - total_bytes_sent + self.write_buffer(message[total_bytes_sent: + (total_bytes_sent + buffer_length)]) + total_bytes_sent += buffer_length + # A message is always terminated by a zero-length buffer. + self.write_buffer_length(0) + + def write_buffer(self, chunk): + buffer_length = len(chunk) + self.write_buffer_length(buffer_length) + self.writer.write(chunk) + + def write_buffer_length(self, n): + self.writer.write(BIG_ENDIAN_INT_STRUCT.pack(n)) + +# +# Transceiver Implementations +# + +class HTTPTransceiver(object): + """ + A simple HTTP-based transceiver implementation. 
+ Useful for clients but not for servers + """ + def __init__(self, host, port): + self.conn = httplib.HTTPConnection(host, port) + self.conn.connect() + + # read-only properties + sock = property(lambda self: self.conn.sock) + remote_name = property(lambda self: self.sock.getsockname()) + + # read/write properties + def set_conn(self, new_conn): + self._conn = new_conn + conn = property(lambda self: self._conn, set_conn) + + def transceive(self, request): + self.write_framed_message(request) + result = self.read_framed_message() + return result + + def read_framed_message(self): + response = self.conn.getresponse() + response_reader = FramedReader(response) + framed_message = response_reader.read_framed_message() + response.read() # ensure we're ready for subsequent requests + return framed_message + + def write_framed_message(self, message): + req_method = 'POST' + req_resource = '/' + req_headers = {'Content-Type': 'avro/binary'} + + req_body_buffer = FramedWriter(StringIO()) + req_body_buffer.write_framed_message(message) + req_body = req_body_buffer.writer.getvalue() + + self.conn.request(req_method, req_resource, req_body, req_headers) + + def close(self): + self.conn.close() + +# +# Server Implementations (none yet) +# + diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/protocol.py b/desktop/core/ext-py/avro-1.5.0/src/avro/protocol.py new file mode 100644 index 0000000..104817e --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/protocol.py @@ -0,0 +1,222 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Protocol implementation. +""" +try: + from hashlib import md5 +except ImportError: + from md5 import md5 +try: + import json +except ImportError: + import simplejson as json +from avro import schema + +# +# Constants +# + +# TODO(hammer): confirmed 'fixed' with Doug +VALID_TYPE_SCHEMA_TYPES = ('enum', 'record', 'error', 'fixed') + +# +# Exceptions +# + +class ProtocolParseException(schema.AvroException): + pass + +# +# Base Classes +# + +class Protocol(object): + """An application protocol.""" + def _parse_types(self, types, type_names): + type_objects = [] + for type in types: + type_object = schema.make_avsc_object(type, type_names) + if type_object.type not in VALID_TYPE_SCHEMA_TYPES: + fail_msg = 'Type %s not an enum, fixed, record, or error.' % type + raise ProtocolParseException(fail_msg) + type_objects.append(type_object) + return type_objects + + def _parse_messages(self, messages, names): + message_objects = {} + for name, body in messages.iteritems(): + if message_objects.has_key(name): + fail_msg = 'Message name "%s" repeated.' % name + raise ProtocolParseException(fail_msg) + elif not(hasattr(body, 'get') and callable(body.get)): + fail_msg = 'Message name "%s" has non-object body %s.' 
% (name, body) + raise ProtocolParseException(fail_msg) + request = body.get('request') + response = body.get('response') + errors = body.get('errors') + message_objects[name] = Message(name, request, response, errors, names) + return message_objects + + def __init__(self, name, namespace=None, types=None, messages=None): + # Ensure valid ctor args + if not name: + fail_msg = 'Protocols must have a non-empty name.' + raise ProtocolParseException(fail_msg) + elif not isinstance(name, basestring): + fail_msg = 'The name property must be a string.' + raise ProtocolParseException(fail_msg) + elif namespace is not None and not isinstance(namespace, basestring): + fail_msg = 'The namespace property must be a string.' + raise ProtocolParseException(fail_msg) + elif types is not None and not isinstance(types, list): + fail_msg = 'The types property must be a list.' + raise ProtocolParseException(fail_msg) + elif (messages is not None and + not(hasattr(messages, 'get') and callable(messages.get))): + fail_msg = 'The messages property must be a JSON object.' + raise ProtocolParseException(fail_msg) + + self._props = {} + self.set_prop('name', name) + type_names = schema.Names() + if namespace is not None: + self.set_prop('namespace', namespace) + type_names.default_namespace = namespace + if types is not None: + self.set_prop('types', self._parse_types(types, type_names)) + if messages is not None: + self.set_prop('messages', self._parse_messages(messages, type_names)) + self._md5 = md5(str(self)).digest() + + # read-only properties + name = property(lambda self: self.get_prop('name')) + namespace = property(lambda self: self.get_prop('namespace')) + fullname = property(lambda self: + schema.Name(self.name, self.namespace).fullname) + types = property(lambda self: self.get_prop('types')) + types_dict = property(lambda self: dict([(type.name, type) + for type in self.types])) + messages = property(lambda self: self.get_prop('messages')) + md5 = property(lambda self: self._md5) + props = property(lambda self: self._props) + + # utility functions to manipulate properties dict + def get_prop(self, key): + return self.props.get(key) + def set_prop(self, key, value): + self.props[key] = value + + def to_json(self): + to_dump = {} + to_dump['protocol'] = self.name + names = schema.Names() + if self.namespace: + to_dump['namespace'] = self.namespace + if self.types: + to_dump['types'] = [ t.to_json(names) for t in self.types ] + if self.messages: + messages_dict = {} + for name, body in self.messages.iteritems(): + messages_dict[name] = body.to_json(names) + to_dump['messages'] = messages_dict + return to_dump + + def __str__(self): + return json.dumps(self.to_json()) + + def __eq__(self, that): + to_cmp = json.loads(str(self)) + return to_cmp == json.loads(str(that)) + +class Message(object): + """A Protocol message.""" + def _parse_request(self, request, names): + if not isinstance(request, list): + fail_msg = 'Request property not a list: %s' % request + raise ProtocolParseException(fail_msg) + return schema.RecordSchema(None, None, request, names, 'request') + + def _parse_response(self, response, names): + if isinstance(response, basestring) and names.has_name(response, None): + return names.get_name(response, None) + else: + return schema.make_avsc_object(response, names) + + def _parse_errors(self, errors, names): + if not isinstance(errors, list): + fail_msg = 'Errors property not a list: %s' % errors + raise ProtocolParseException(fail_msg) + errors_for_parsing = {'type': 'error_union', 
'declared_errors': errors} + return schema.make_avsc_object(errors_for_parsing, names) + + def __init__(self, name, request, response, errors=None, names=None): + self._name = name + + self._props = {} + self.set_prop('request', self._parse_request(request, names)) + self.set_prop('response', self._parse_response(response, names)) + if errors is not None: + self.set_prop('errors', self._parse_errors(errors, names)) + + # read-only properties + name = property(lambda self: self._name) + request = property(lambda self: self.get_prop('request')) + response = property(lambda self: self.get_prop('response')) + errors = property(lambda self: self.get_prop('errors')) + props = property(lambda self: self._props) + + # utility functions to manipulate properties dict + def get_prop(self, key): + return self.props.get(key) + def set_prop(self, key, value): + self.props[key] = value + + def __str__(self): + return json.dumps(self.to_json(schema.Names())) + + def to_json(self, names): + to_dump = {} + to_dump['request'] = self.request.to_json(names) + to_dump['response'] = self.response.to_json(names) + if self.errors: + to_dump['errors'] = self.errors.to_json(names) + return to_dump + + def __eq__(self, that): + return self.name == that.name and self.props == that.props + +def make_avpr_object(json_data): + """Build Avro Protocol from data parsed out of JSON string.""" + if hasattr(json_data, 'get') and callable(json_data.get): + name = json_data.get('protocol') + namespace = json_data.get('namespace') + types = json_data.get('types') + messages = json_data.get('messages') + return Protocol(name, namespace, types, messages) + else: + raise ProtocolParseException('Not a JSON object: %s' % json_data) + +def parse(json_string): + """Constructs the Protocol from the JSON text.""" + try: + json_data = json.loads(json_string) + except: + raise ProtocolParseException('Error parsing JSON: %s' % json_string) + + # construct the Avro Protocol object + return make_avpr_object(json_data) + diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/schema.py b/desktop/core/ext-py/avro-1.5.0/src/avro/schema.py new file mode 100644 index 0000000..24718c0 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/schema.py @@ -0,0 +1,707 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Contains the Schema classes. 
+
+A schema may be one of:
+  A record, mapping field names to field value data;
+  An error, equivalent to a record;
+  An enum, containing one of a small set of symbols;
+  An array of values, all of the same schema;
+  A map containing string/value pairs, each of a declared schema;
+  A union of other schemas;
+  A fixed sized binary object;
+  A unicode string;
+  A sequence of bytes;
+  A 32-bit signed int;
+  A 64-bit signed long;
+  A 32-bit floating-point float;
+  A 64-bit floating-point double;
+  A boolean; or
+  Null.
+"""
+try:
+  import json
+except ImportError:
+  import simplejson as json
+
+#
+# Constants
+#
+
+PRIMITIVE_TYPES = (
+  'null',
+  'boolean',
+  'string',
+  'bytes',
+  'int',
+  'long',
+  'float',
+  'double',
+)
+
+NAMED_TYPES = (
+  'fixed',
+  'enum',
+  'record',
+  'error',
+)
+
+VALID_TYPES = PRIMITIVE_TYPES + NAMED_TYPES + (
+  'array',
+  'map',
+  'union',
+  'request',
+  'error_union'
+)
+
+RESERVED_PROPS = (
+  'type',
+  'name',
+  'namespace',
+  'fields',     # Record
+  'items',      # Array
+  'size',       # Fixed
+  'symbols',    # Enum
+  'values',     # Map
+)
+
+VALID_FIELD_SORT_ORDERS = (
+  'ascending',
+  'descending',
+  'ignore',
+)
+
+#
+# Exceptions
+#
+
+class AvroException(Exception):
+  pass
+
+class SchemaParseException(AvroException):
+  pass
+
+#
+# Base Classes
+#
+
+class Schema(object):
+  """Base class for all Schema classes."""
+  def __init__(self, type):
+    # Ensure valid ctor args
+    if not isinstance(type, basestring):
+      fail_msg = 'Schema type must be a string.'
+      raise SchemaParseException(fail_msg)
+    elif type not in VALID_TYPES:
+      fail_msg = '%s is not a valid type.' % type
+      raise SchemaParseException(fail_msg)
+
+    # add members
+    if not hasattr(self, '_props'): self._props = {}
+    self.set_prop('type', type)
+
+  # Read-only properties dict. Printing schemas
+  # creates JSON properties directly from this dict.
+  props = property(lambda self: self._props)
+  type = property(lambda self: self.get_prop('type'))
+
+  # utility functions to manipulate properties dict
+  def get_prop(self, key):
+    return self.props.get(key)
+
+  def set_prop(self, key, value):
+    self.props[key] = value
+
+  def __str__(self):
+    names = Names()
+    return json.dumps(self.to_json(names))
+
+  def to_json(self, names):
+    """
+    Converts the schema object into its AVRO specification representation.
+
+    Schema types that have names (records, enums, and fixed) must
+    be aware of not re-defining schemas that are already listed
+    in the parameter names.
+    """
+    raise Exception("Must be implemented by subclasses.")
+
+class Name(object):
+  """Class to describe Avro name."""
+
+  def __init__(self, name_attr, space_attr, default_space):
+    """
+    Formulate full name according to the specification.
+
+    @arg name_attr: name value read in schema or None.
+    @arg space_attr: namespace value read in schema or None.
+    @arg default_space: the current default space or None.
+    """
+    # Ensure valid ctor args
+    if not (isinstance(name_attr, basestring) or (name_attr is None)):
+      fail_msg = 'Name must be non-empty string or None.'
+      raise SchemaParseException(fail_msg)
+    elif name_attr == "":
+      fail_msg = 'Name must be non-empty string or None.'
+      raise SchemaParseException(fail_msg)
+
+    if not (isinstance(space_attr, basestring) or (space_attr is None)):
+      fail_msg = 'Space must be non-empty string or None.'
+      raise SchemaParseException(fail_msg)
+    elif space_attr == "":
+      fail_msg = 'Space must be non-empty string or None.'
+      raise SchemaParseException(fail_msg)
+
+    if not (isinstance(default_space, basestring) or (default_space is None)):
+      fail_msg = 'Default space must be non-empty string or None.'
+      raise SchemaParseException(fail_msg)
+    elif default_space == "":
+      fail_msg = 'Default space must be non-empty string or None.'
+      raise SchemaParseException(fail_msg)
+
+    self._full = None;
+
+    if name_attr is None or name_attr == "":
+      return;
+
+    if (name_attr.find('.') < 0):
+      if (space_attr is not None) and (space_attr != ""):
+        self._full = "%s.%s" % (space_attr, name_attr)
+      else:
+        if (default_space is not None) and (default_space != ""):
+          self._full = "%s.%s" % (default_space, name_attr)
+        else:
+          self._full = name_attr
+    else:
+      self._full = name_attr
+
+  def __eq__(self, other):
+    if not isinstance(other, Name):
+      return False
+    return (self.fullname == other.fullname)
+
+  fullname = property(lambda self: self._full)
+
+  def get_space(self):
+    """Back out a namespace from full name."""
+    if self._full is None:
+      return None
+
+    if (self._full.find('.') > 0):
+      return self._full.rsplit(".", 1)[0]
+    else:
+      return ""
+
+class Names(object):
+  """Track name set and default namespace during parsing."""
+  def __init__(self, default_namespace=None):
+    self.names = {}
+    self.default_namespace = default_namespace
+
+  def has_name(self, name_attr, space_attr):
+    test = Name(name_attr, space_attr, self.default_namespace).fullname
+    return self.names.has_key(test)
+
+  def get_name(self, name_attr, space_attr):
+    test = Name(name_attr, space_attr, self.default_namespace).fullname
+    if not self.names.has_key(test):
+      return None
+    return self.names[test]
+
+  def add_name(self, name_attr, space_attr, new_schema):
+    """
+    Add a new schema object to the name set.
+
+    @arg name_attr: name value read in schema
+    @arg space_attr: namespace value read in schema.
+
+    @return: the Name that was just added.
+    """
+    to_add = Name(name_attr, space_attr, self.default_namespace)
+
+    if to_add.fullname in VALID_TYPES:
+      fail_msg = '%s is a reserved type name.' % to_add.fullname
+      raise SchemaParseException(fail_msg)
+    elif self.names.has_key(to_add.fullname):
+      fail_msg = 'The name "%s" is already in use.' % to_add.fullname
+      raise SchemaParseException(fail_msg)
+
+    self.names[to_add.fullname] = new_schema
+    return to_add
+
+class NamedSchema(Schema):
+  """Named Schemas specified in NAMED_TYPES."""
+  def __init__(self, type, name, namespace=None, names=None):
+    # Ensure valid ctor args
+    if not name:
+      fail_msg = 'Named Schemas must have a non-empty name.'
+      raise SchemaParseException(fail_msg)
+    elif not isinstance(name, basestring):
+      fail_msg = 'The name property must be a string.'
+      raise SchemaParseException(fail_msg)
+    elif namespace is not None and not isinstance(namespace, basestring):
+      fail_msg = 'The namespace property must be a string.'
+ raise SchemaParseException(fail_msg) + + # Call parent ctor + Schema.__init__(self, type) + + # Add class members + new_name = names.add_name(name, namespace, self) + + # Store name and namespace as they were read in origin schema + self.set_prop('name', name) + if namespace is not None: + self.set_prop('namespace', new_name.get_space()) + + # Store full name as calculated from name, namespace + self._fullname = new_name.fullname + + def name_ref(self, names): + if self.namespace == names.default_namespace: + return self.name + else: + return self.fullname + + # read-only properties + name = property(lambda self: self.get_prop('name')) + namespace = property(lambda self: self.get_prop('namespace')) + fullname = property(lambda self: self._fullname) + +class Field(object): + def __init__(self, type, name, has_default, default=None, order=None, names=None): + # Ensure valid ctor args + if not name: + fail_msg = 'Fields must have a non-empty name.' + raise SchemaParseException(fail_msg) + elif not isinstance(name, basestring): + fail_msg = 'The name property must be a string.' + raise SchemaParseException(fail_msg) + elif order is not None and order not in VALID_FIELD_SORT_ORDERS: + fail_msg = 'The order property %s is not valid.' % order + raise SchemaParseException(fail_msg) + + # add members + self._props = {} + self._has_default = has_default + + if (isinstance(type, basestring) and names is not None + and names.has_name(type, None)): + type_schema = names.get_name(type, None) + else: + try: + type_schema = make_avsc_object(type, names) + except Exception, e: + fail_msg = 'Type property "%s" not a valid Avro schema: %s' % (type, e) + raise SchemaParseException(fail_msg) + self.set_prop('type', type_schema) + self.set_prop('name', name) + # TODO(hammer): check to ensure default is valid + if has_default: self.set_prop('default', default) + if order is not None: self.set_prop('order', order) + + # read-only properties + type = property(lambda self: self.get_prop('type')) + name = property(lambda self: self.get_prop('name')) + default = property(lambda self: self.get_prop('default')) + has_default = property(lambda self: self._has_default) + order = property(lambda self: self.get_prop('order')) + props = property(lambda self: self._props) + + # utility functions to manipulate properties dict + def get_prop(self, key): + return self.props.get(key) + def set_prop(self, key, value): + self.props[key] = value + + def to_json(self, names): + to_dump = self.props.copy() + to_dump['type'] = self.type.to_json(names) + return to_dump + + def __eq__(self, that): + to_cmp = json.loads(str(self)) + return to_cmp == json.loads(str(that)) + +# +# Primitive Types +# +class PrimitiveSchema(Schema): + """Valid primitive types are in PRIMITIVE_TYPES.""" + def __init__(self, type): + # Ensure valid ctor args + if type not in PRIMITIVE_TYPES: + raise AvroException("%s is not a valid primitive type." % type) + + # Call parent ctor + Schema.__init__(self, type) + + self.fullname = type + + def to_json(self, names): + if len(self.props) == 1: + return self.fullname + else: + return self.props + + def __eq__(self, that): + return self.props == that.props + +# +# Complex Types (non-recursive) +# + +class FixedSchema(NamedSchema): + def __init__(self, name, namespace, size, names=None): + # Ensure valid ctor args + if not isinstance(size, int): + fail_msg = 'Fixed Schema requires a valid integer for size property.' 
+      raise AvroException(fail_msg)
+
+    # Call parent ctor
+    NamedSchema.__init__(self, 'fixed', name, namespace, names)
+
+    # Add class members
+    self.set_prop('size', size)
+
+  # read-only properties
+  size = property(lambda self: self.get_prop('size'))
+
+  def to_json(self, names):
+    if self.fullname in names.names:
+      return self.name_ref(names)
+    else:
+      names.names[self.fullname] = self
+      return self.props
+
+  def __eq__(self, that):
+    return self.props == that.props
+
+class EnumSchema(NamedSchema):
+  def __init__(self, name, namespace, symbols, names=None):
+    # Ensure valid ctor args
+    if not isinstance(symbols, list):
+      fail_msg = 'Enum Schema requires a JSON array for the symbols property.'
+      raise AvroException(fail_msg)
+    elif False in [isinstance(s, basestring) for s in symbols]:
+      fail_msg = 'Enum Schema requires all symbols to be JSON strings.'
+      raise AvroException(fail_msg)
+    elif len(set(symbols)) < len(symbols):
+      fail_msg = 'Duplicate symbol: %s' % symbols
+      raise AvroException(fail_msg)
+
+    # Call parent ctor
+    NamedSchema.__init__(self, 'enum', name, namespace, names)
+
+    # Add class members
+    self.set_prop('symbols', symbols)
+
+  # read-only properties
+  symbols = property(lambda self: self.get_prop('symbols'))
+
+  def to_json(self, names):
+    if self.fullname in names.names:
+      return self.name_ref(names)
+    else:
+      names.names[self.fullname] = self
+      return self.props
+
+  def __eq__(self, that):
+    return self.props == that.props
+
+#
+# Complex Types (recursive)
+#
+
+class ArraySchema(Schema):
+  def __init__(self, items, names=None):
+    # Call parent ctor
+    Schema.__init__(self, 'array')
+    # Add class members
+
+    if isinstance(items, basestring) and names.has_name(items, None):
+      items_schema = names.get_name(items, None)
+    else:
+      try:
+        items_schema = make_avsc_object(items, names)
+      except SchemaParseException, e:
+        fail_msg = 'Items schema (%s) not a valid Avro schema: %s (known names: %s)' % (items, e, names.names.keys())
+        raise SchemaParseException(fail_msg)
+
+    self.set_prop('items', items_schema)
+
+  # read-only properties
+  items = property(lambda self: self.get_prop('items'))
+
+  def to_json(self, names):
+    to_dump = self.props.copy()
+    item_schema = self.get_prop('items')
+    to_dump['items'] = item_schema.to_json(names)
+    return to_dump
+
+  def __eq__(self, that):
+    to_cmp = json.loads(str(self))
+    return to_cmp == json.loads(str(that))
+
+class MapSchema(Schema):
+  def __init__(self, values, names=None):
+    # Call parent ctor
+    Schema.__init__(self, 'map')
+
+    # Add class members
+    if isinstance(values, basestring) and names.has_name(values, None):
+      values_schema = names.get_name(values, None)
+    else:
+      try:
+        values_schema = make_avsc_object(values, names)
+      except:
+        fail_msg = 'Values schema not a valid Avro schema.'
+        raise SchemaParseException(fail_msg)
+
+    self.set_prop('values', values_schema)
+
+  # read-only properties
+  values = property(lambda self: self.get_prop('values'))
+
+  def to_json(self, names):
+    to_dump = self.props.copy()
+    to_dump['values'] = self.get_prop('values').to_json(names)
+    return to_dump
+
+  def __eq__(self, that):
+    to_cmp = json.loads(str(self))
+    return to_cmp == json.loads(str(that))
+
+class UnionSchema(Schema):
+  """
+  names is a dictionary of schema objects
+  """
+  def __init__(self, schemas, names=None):
+    # Ensure valid ctor args
+    if not isinstance(schemas, list):
+      fail_msg = 'Union schema requires a list of schemas.'
+ raise SchemaParseException(fail_msg) + + # Call parent ctor + Schema.__init__(self, 'union') + + # Add class members + schema_objects = [] + for schema in schemas: + if isinstance(schema, basestring) and names.has_name(schema, None): + new_schema = names.get_name(schema, None) + else: + try: + new_schema = make_avsc_object(schema, names) + except Exception, e: + raise SchemaParseException('Union item must be a valid Avro schema: %s' % str(e)) + # check the new schema + if (new_schema.type in VALID_TYPES and new_schema.type not in NAMED_TYPES + and new_schema.type in [schema.type for schema in schema_objects]): + raise SchemaParseException('%s type already in Union' % new_schema.type) + elif new_schema.type == 'union': + raise SchemaParseException('Unions cannot contain other unions.') + else: + schema_objects.append(new_schema) + self._schemas = schema_objects + + # read-only properties + schemas = property(lambda self: self._schemas) + + def to_json(self, names): + to_dump = [] + for schema in self.schemas: + to_dump.append(schema.to_json(names)) + return to_dump + + def __eq__(self, that): + to_cmp = json.loads(str(self)) + return to_cmp == json.loads(str(that)) + +class ErrorUnionSchema(UnionSchema): + def __init__(self, schemas, names=None): + # Prepend "string" to handle system errors + UnionSchema.__init__(self, ['string'] + schemas, names) + + def to_json(self, names): + to_dump = [] + for schema in self.schemas: + # Don't print the system error schema + if schema.type == 'string': continue + to_dump.append(schema.to_json(names)) + return to_dump + +class RecordSchema(NamedSchema): + @staticmethod + def make_field_objects(field_data, names): + """We're going to need to make message parameters too.""" + field_objects = [] + field_names = [] + for i, field in enumerate(field_data): + if hasattr(field, 'get') and callable(field.get): + type = field.get('type') + name = field.get('name') + + # null values can have a default value of None + has_default = False + default = None + if field.has_key('default'): + has_default = True + default = field.get('default') + + order = field.get('order') + new_field = Field(type, name, has_default, default, order, names) + # make sure field name has not been used yet + if new_field.name in field_names: + fail_msg = 'Field name %s already in use.' % new_field.name + raise SchemaParseException(fail_msg) + field_names.append(new_field.name) + else: + raise SchemaParseException('Not a valid field: %s' % field) + field_objects.append(new_field) + return field_objects + + def __init__(self, name, namespace, fields, names=None, schema_type='record'): + # Ensure valid ctor args + if fields is None: + fail_msg = 'Record schema requires a non-empty fields property.' + raise SchemaParseException(fail_msg) + elif not isinstance(fields, list): + fail_msg = 'Fields property must be a list of Avro schemas.' 
+ raise SchemaParseException(fail_msg) + + # Call parent ctor (adds own name to namespace, too) + if schema_type == 'request': + Schema.__init__(self, schema_type) + else: + NamedSchema.__init__(self, schema_type, name, namespace, names) + + if schema_type == 'record': + old_default = names.default_namespace + names.default_namespace = Name(name, namespace, + names.default_namespace).get_space() + + # Add class members + field_objects = RecordSchema.make_field_objects(fields, names) + self.set_prop('fields', field_objects) + + if schema_type == 'record': + names.default_namespace = old_default + + # read-only properties + fields = property(lambda self: self.get_prop('fields')) + + @property + def fields_dict(self): + fields_dict = {} + for field in self.fields: + fields_dict[field.name] = field + return fields_dict + + def to_json(self, names): + # Request records don't have names + if self.type == 'request': + return [ f.to_json(names) for f in self.fields ] + + if self.fullname in names.names: + return self.name_ref(names) + else: + names.names[self.fullname] = self + + to_dump = self.props.copy() + to_dump['fields'] = [ f.to_json(names) for f in self.fields ] + return to_dump + + def __eq__(self, that): + to_cmp = json.loads(str(self)) + return to_cmp == json.loads(str(that)) + +# +# Module Methods +# + +# TODO(hammer): handle non-reserved properties +def make_avsc_object(json_data, names=None): + """ + Build Avro Schema from data parsed out of JSON string. + + @arg names: A Name object (tracks seen names and default space) + """ + if names == None: + names = Names() + + # JSON object (non-union) + if hasattr(json_data, 'get') and callable(json_data.get): + type = json_data.get('type') + if type in PRIMITIVE_TYPES: + return PrimitiveSchema(type) + elif type in NAMED_TYPES: + name = json_data.get('name') + namespace = json_data.get('namespace') + if type == 'fixed': + size = json_data.get('size') + return FixedSchema(name, namespace, size, names) + elif type == 'enum': + symbols = json_data.get('symbols') + return EnumSchema(name, namespace, symbols, names) + elif type in ['record', 'error']: + fields = json_data.get('fields') + return RecordSchema(name, namespace, fields, names, type) + else: + raise SchemaParseException('Unknown Named Type: %s' % type) + elif type in VALID_TYPES: + if type == 'array': + items = json_data.get('items') + return ArraySchema(items, names) + elif type == 'map': + values = json_data.get('values') + return MapSchema(values, names) + elif type == 'error_union': + declared_errors = json_data.get('declared_errors') + return ErrorUnionSchema(declared_errors, names) + else: + raise SchemaParseException('Unknown Valid Type: %s' % type) + elif type is None: + raise SchemaParseException('No "type" property: %s' % json_data) + else: + raise SchemaParseException('Undefined type: %s' % type) + # JSON array (union) + elif isinstance(json_data, list): + return UnionSchema(json_data, names) + # JSON string (primitive) + elif json_data in PRIMITIVE_TYPES: + return PrimitiveSchema(json_data) + # not for us! + else: + fail_msg = "Could not make an Avro Schema object from %s." % json_data + raise SchemaParseException(fail_msg) + +# TODO(hammer): make method for reading from a file? 
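+# Illustrative usage (a sketch, not part of the upstream module):
+#
+#   >>> s = parse('{"type": "array", "items": "long"}')
+#   >>> s.type
+#   'array'
+#   >>> s.items.type
+#   'long'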
+def parse(json_string): + """Constructs the Schema from the JSON text.""" + # TODO(hammer): preserve stack trace from JSON parse + # parse the JSON + try: + json_data = json.loads(json_string) + except: + raise SchemaParseException('Error parsing JSON: %s' % json_string) + + # Initialize the names object + names = Names() + + # construct the Avro Schema object + return make_avsc_object(json_data, names) diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/tool.py b/desktop/core/ext-py/avro-1.5.0/src/avro/tool.py new file mode 100644 index 0000000..edd6f18 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/tool.py @@ -0,0 +1,160 @@ +#! /usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Command-line tool + +NOTE: The API for the command-line tool is experimental. +""" +import sys +from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler +import urlparse +from avro import io +from avro import datafile +from avro import protocol +from avro import ipc + +class GenericResponder(ipc.Responder): + def __init__(self, proto, msg, datum): + proto_json = file(proto, 'r').read() + ipc.Responder.__init__(self, protocol.parse(proto_json)) + self.msg = msg + self.datum = datum + + def invoke(self, message, request): + if message.name == self.msg: + print >> sys.stderr, "Message: %s Datum: %s" % (message.name, self.datum) + # server will shut down after processing a single Avro request + global server_should_shutdown + server_should_shutdown = True + return self.datum + +class GenericHandler(BaseHTTPRequestHandler): + def do_POST(self): + self.responder = responder + call_request_reader = ipc.FramedReader(self.rfile) + call_request = call_request_reader.read_framed_message() + resp_body = self.responder.respond(call_request) + self.send_response(200) + self.send_header('Content-Type', 'avro/binary') + self.end_headers() + resp_writer = ipc.FramedWriter(self.wfile) + resp_writer.write_framed_message(resp_body) + if server_should_shutdown: + print >> sys.stderr, "Shutting down server." + self.server.force_stop() + +class StoppableHTTPServer(HTTPServer): + """HTTPServer.shutdown added in Python 2.6. 
FML.""" + stopped = False + allow_reuse_address = True + def __init__(self, *args, **kw): + HTTPServer.__init__(self, *args, **kw) + self.allow_reuse_address = True + + def serve_forever(self): + while not self.stopped: + self.handle_request() + + def force_stop(self): + self.server_close() + self.stopped = True + self.serve_forever() + +def run_server(uri, proto, msg, datum): + url_obj = urlparse.urlparse(uri) + server_addr = (url_obj.hostname, url_obj.port) + global responder + global server_should_shutdown + server_should_shutdown = False + responder = GenericResponder(proto, msg, datum) + server = StoppableHTTPServer(server_addr, GenericHandler) + print "Port: %s" % server.server_port + sys.stdout.flush() + server.allow_reuse_address = True + print >> sys.stderr, "Starting server." + server.serve_forever() + +def send_message(uri, proto, msg, datum): + url_obj = urlparse.urlparse(uri) + client = ipc.HTTPTransceiver(url_obj.hostname, url_obj.port) + proto_json = file(proto, 'r').read() + requestor = ipc.Requestor(protocol.parse(proto_json), client) + print requestor.request(msg, datum) + +def file_or_stdin(f): + if f == "-": + return sys.stdin + else: + return file(f) + +def main(args=sys.argv): + if len(args) == 1: + print "Usage: %s [dump|rpcreceive|rpcsend]" % args[0] + return 1 + + if args[1] == "dump": + if len(args) != 3: + print "Usage: %s dump input_file" % args[0] + return 1 + for d in datafile.DataFileReader(file_or_stdin(args[2]), io.DatumReader()): + print repr(d) + elif args[1] == "rpcreceive": + usage_str = "Usage: %s rpcreceive uri protocol_file " % args[0] + usage_str += "message_name (-data d | -file f)" + if len(args) not in [5, 7]: + print usage_str + return 1 + uri, proto, msg = args[2:5] + datum = None + if len(args) > 5: + if args[5] == "-file": + reader = open(args[6], 'rb') + datum_reader = io.DatumReader() + dfr = datafile.DataFileReader(reader, datum_reader) + datum = dfr.next() + elif args[5] == "-data": + print "JSON Decoder not yet implemented." + return 1 + else: + print usage_str + return 1 + run_server(uri, proto, msg, datum) + elif args[1] == "rpcsend": + usage_str = "Usage: %s rpcsend uri protocol_file " % args[0] + usage_str += "message_name (-data d | -file f)" + if len(args) not in [5, 7]: + print usage_str + return 1 + uri, proto, msg = args[2:5] + datum = None + if len(args) > 5: + if args[5] == "-file": + reader = open(args[6], 'rb') + datum_reader = io.DatumReader() + dfr = datafile.DataFileReader(reader, datum_reader) + datum = dfr.next() + elif args[5] == "-data": + print "JSON Decoder not yet implemented." + return 1 + else: + print usage_str + return 1 + send_message(uri, proto, msg, datum) + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/desktop/core/ext-py/avro-1.5.0/src/avro/txipc.py b/desktop/core/ext-py/avro-1.5.0/src/avro/txipc.py new file mode 100644 index 0000000..6a4d8b7 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/src/avro/txipc.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+try:
+  from cStringIO import StringIO
+except ImportError:
+  from StringIO import StringIO
+from avro import ipc
+from avro import io
+
+from zope.interface import implements
+
+from twisted.web.client import Agent
+from twisted.web.http_headers import Headers
+from twisted.internet.defer import maybeDeferred, Deferred
+from twisted.web.iweb import IBodyProducer
+from twisted.web import resource, server
+from twisted.internet.protocol import Protocol
+
+class TwistedRequestor(ipc.BaseRequestor):
+  """A Twisted-compatible requestor. Returns a Deferred that will fire with the
+  returning value, instead of blocking until the request completes."""
+  def _process_handshake(self, call_response, message_name, request_datum):
+    # process the handshake and call response
+    buffer_decoder = io.BinaryDecoder(StringIO(call_response))
+    call_response_exists = self.read_handshake_response(buffer_decoder)
+    if call_response_exists:
+      return self.read_call_response(message_name, buffer_decoder)
+    else:
+      return self.request(message_name, request_datum)
+
+  def issue_request(self, call_request, message_name, request_datum):
+    d = self.transceiver.transceive(call_request)
+    d.addCallback(self._process_handshake, message_name, request_datum)
+    return d
+
+class RequestStreamingProducer(object):
+  """A streaming producer for issuing requests with the Twisted.web Agent."""
+  implements(IBodyProducer)
+
+  paused = False
+  stopped = False
+  started = False
+
+  def __init__(self, message):
+    self._message = message
+    self._length = len(message)
+    # We need a buffer length header for every buffer and an additional
+    # zero-length buffer as the message terminator
+    self._length += (self._length / ipc.BUFFER_SIZE + 2) \
+      * ipc.BUFFER_HEADER_LENGTH
+    self._total_bytes_sent = 0
+    self._deferred = Deferred()
+
+  # read-only properties
+  message = property(lambda self: self._message)
+  length = property(lambda self: self._length)
+  consumer = property(lambda self: self._consumer)
+  deferred = property(lambda self: self._deferred)
+
+  def _get_total_bytes_sent(self):
+    return self._total_bytes_sent
+
+  def _set_total_bytes_sent(self, bytes_sent):
+    self._total_bytes_sent = bytes_sent
+
+  total_bytes_sent = property(_get_total_bytes_sent, _set_total_bytes_sent)
+
+  def startProducing(self, consumer):
+    if self.started:
+      return
+
+    self.started = True
+    self._consumer = consumer
+    # Keep writing data to the consumer until we're finished,
+    # paused (pauseProducing()) or stopped (stopProducing())
+    while self.length - self.total_bytes_sent > 0 and \
+      not self.paused and not self.stopped:
+      self.write()
+    # self.write will fire this deferred once it has written
+    # the entire message to the consumer
+    return self.deferred
+
+  def resumeProducing(self):
+    self.paused = False
+    self.write()
+
+  def pauseProducing(self):
+    self.paused = True
+
+  def stopProducing(self):
+    self.stopped = True
+
+  def write(self):
+    if self.length - self.total_bytes_sent > ipc.BUFFER_SIZE:
+      buffer_length = ipc.BUFFER_SIZE
+    else:
+      buffer_length = self.length - self.total_bytes_sent
+
self.write_buffer(self.message[self.total_bytes_sent: + (self.total_bytes_sent + buffer_length)]) + self.total_bytes_sent += buffer_length + # Make sure we wrote the entire message + if self.total_bytes_sent == self.length and not self.stopped: + self.stopProducing() + # A message is always terminated by a zero-length buffer. + self.write_buffer_length(0) + self.deferred.callback(None) + + def write_buffer(self, chunk): + buffer_length = len(chunk) + self.write_buffer_length(buffer_length) + self.consumer.write(chunk) + + def write_buffer_length(self, n): + self.consumer.write(ipc.BIG_ENDIAN_INT_STRUCT.pack(n)) + +class AvroProtocol(Protocol): + + recvd = '' + done = False + + def __init__(self, finished): + self.finished = finished + self.message = [] + + def dataReceived(self, data): + self.recvd = self.recvd + data + while len(self.recvd) >= ipc.BUFFER_HEADER_LENGTH: + buffer_length ,= ipc.BIG_ENDIAN_INT_STRUCT.unpack( + self.recvd[:ipc.BUFFER_HEADER_LENGTH]) + if buffer_length == 0: + response = ''.join(self.message) + self.done = True + self.finished.callback(response) + break + if len(self.recvd) < buffer_length + ipc.BUFFER_HEADER_LENGTH: + break + buffer = self.recvd[ipc.BUFFER_HEADER_LENGTH:buffer_length + ipc.BUFFER_HEADER_LENGTH] + self.recvd = self.recvd[buffer_length + ipc.BUFFER_HEADER_LENGTH:] + self.message.append(buffer) + + def connectionLost(self, reason): + if not self.done: + self.finished.errback(ipc.ConnectionClosedException("Reader read 0 bytes.")) + +class TwistedHTTPTransceiver(object): + """This transceiver uses the Agent class present in Twisted.web >= 9.0 + for issuing requests to the remote endpoint.""" + def __init__(self, host, port, remote_name=None, reactor=None): + self.url = "http://%s:%d/" % (host, port) + + if remote_name is None: + # There's no easy way to get this peer's remote address + # in Twisted so I use a random UUID to identify ourselves + import uuid + self.remote_name = uuid.uuid4() + + if reactor is None: + from twisted.internet import reactor + self.agent = Agent(reactor) + + def read_framed_message(self, response): + finished = Deferred() + response.deliverBody(AvroProtocol(finished)) + return finished + + def transceive(self, request): + req_method = 'POST' + req_headers = { + 'Content-Type': ['avro/binary'], + 'Accept-Encoding': ['identity'], + } + + body_producer = RequestStreamingProducer(request) + d = self.agent.request( + req_method, + self.url, + headers=Headers(req_headers), + bodyProducer=body_producer) + return d.addCallback(self.read_framed_message) + +class AvroResponderResource(resource.Resource): + """This Twisted.web resource can be placed anywhere in a URL hierarchy + to provide an Avro endpoint. 
Different Avro protocols can be served + by the same web server as long as they are in different resources in + a URL hierarchy.""" + isLeaf = True + + def __init__(self, responder): + resource.Resource.__init__(self) + self.responder = responder + + def cb_render_POST(self, resp_body, request): + request.setResponseCode(200) + request.setHeader('Content-Type', 'avro/binary') + resp_writer = ipc.FramedWriter(request) + resp_writer.write_framed_message(resp_body) + request.finish() + + def render_POST(self, request): + # Unfortunately, Twisted.web doesn't support incoming + # streamed input yet, the whole payload must be kept in-memory + request.content.seek(0, 0) + call_request_reader = ipc.FramedReader(request.content) + call_request = call_request_reader.read_framed_message() + d = maybeDeferred(self.responder.respond, call_request) + d.addCallback(self.cb_render_POST, request) + return server.NOT_DONE_YET diff --git a/desktop/core/ext-py/avro-1.5.0/test/test_datafile.py b/desktop/core/ext-py/avro-1.5.0/test/test_datafile.py new file mode 100644 index 0000000..2f6f550 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/test/test_datafile.py @@ -0,0 +1,149 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
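+# The tests below exercise the write/read round trip. The basic shape of
+# each case (a sketch; path, json_schema and datum vary per test) is:
+#
+#   dfw = datafile.DataFileWriter(open(path, 'wb'), io.DatumWriter(),
+#                                 schema.parse(json_schema))
+#   dfw.append(datum)
+#   dfw.close()
+#   for read_datum in datafile.DataFileReader(open(path, 'rb'),
+#                                             io.DatumReader()):
+#     assert read_datum == datum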
+import os +import unittest +from avro import schema +from avro import io +from avro import datafile + +SCHEMAS_TO_VALIDATE = ( + ('"null"', None), + ('"boolean"', True), + ('"string"', unicode('adsfasdf09809dsf-=adsf')), + ('"bytes"', '12345abcd'), + ('"int"', 1234), + ('"long"', 1234), + ('"float"', 1234.0), + ('"double"', 1234.0), + ('{"type": "fixed", "name": "Test", "size": 1}', 'B'), + ('{"type": "enum", "name": "Test", "symbols": ["A", "B"]}', 'B'), + ('{"type": "array", "items": "long"}', [1, 3, 2]), + ('{"type": "map", "values": "long"}', {'a': 1, 'b': 3, 'c': 2}), + ('["string", "null", "long"]', None), + ("""\ + {"type": "record", + "name": "Test", + "fields": [{"name": "f", "type": "long"}]} + """, {'f': 5}), + ("""\ + {"type": "record", + "name": "Lisp", + "fields": [{"name": "value", + "type": ["null", "string", + {"type": "record", + "name": "Cons", + "fields": [{"name": "car", "type": "Lisp"}, + {"name": "cdr", "type": "Lisp"}]}]}]} + """, {'value': {'car': {'value': 'head'}, 'cdr': {'value': None}}}), +) + +FILENAME = 'test_datafile.out' +CODECS_TO_VALIDATE = ('null', 'deflate') + +# TODO(hammer): clean up written files with ant, not os.remove +class TestDataFile(unittest.TestCase): + def test_round_trip(self): + print '' + print 'TEST ROUND TRIP' + print '===============' + print '' + correct = 0 + for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE): + for codec in CODECS_TO_VALIDATE: + print '' + print 'SCHEMA NUMBER %d' % (i + 1) + print '================' + print '' + print 'Schema: %s' % example_schema + print 'Datum: %s' % datum + print 'Codec: %s' % codec + + # write data in binary to file 10 times + writer = open(FILENAME, 'wb') + datum_writer = io.DatumWriter() + schema_object = schema.parse(example_schema) + dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec) + for i in range(10): + dfw.append(datum) + dfw.close() + + # read data in binary from file + reader = open(FILENAME, 'rb') + datum_reader = io.DatumReader() + dfr = datafile.DataFileReader(reader, datum_reader) + round_trip_data = [] + for datum in dfr: + round_trip_data.append(datum) + + print 'Round Trip Data: %s' % round_trip_data + print 'Round Trip Data Length: %d' % len(round_trip_data) + is_correct = [datum] * 10 == round_trip_data + if is_correct: correct += 1 + print 'Correct Round Trip: %s' % is_correct + print '' + os.remove(FILENAME) + self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE)) + + def test_append(self): + print '' + print 'TEST APPEND' + print '===========' + print '' + correct = 0 + for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE): + for codec in CODECS_TO_VALIDATE: + print '' + print 'SCHEMA NUMBER %d' % (i + 1) + print '================' + print '' + print 'Schema: %s' % example_schema + print 'Datum: %s' % datum + print 'Codec: %s' % codec + + # write data in binary to file once + writer = open(FILENAME, 'wb') + datum_writer = io.DatumWriter() + schema_object = schema.parse(example_schema) + dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec) + dfw.append(datum) + dfw.close() + + # open file, write, and close nine times + for i in range(9): + writer = open(FILENAME, 'ab+') + dfw = datafile.DataFileWriter(writer, io.DatumWriter()) + dfw.append(datum) + dfw.close() + + # read data in binary from file + reader = open(FILENAME, 'rb') + datum_reader = io.DatumReader() + dfr = datafile.DataFileReader(reader, datum_reader) + appended_data = [] + for datum in dfr: + 
appended_data.append(datum) + + print 'Appended Data: %s' % appended_data + print 'Appended Data Length: %d' % len(appended_data) + is_correct = [datum] * 10 == appended_data + if is_correct: correct += 1 + print 'Correct Appended: %s' % is_correct + print '' + os.remove(FILENAME) + self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE)) + +if __name__ == '__main__': + unittest.main() diff --git a/desktop/core/ext-py/avro-1.5.0/test/test_datafile_interop.py b/desktop/core/ext-py/avro-1.5.0/test/test_datafile_interop.py new file mode 100644 index 0000000..d4618b4 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/test/test_datafile_interop.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +from avro import io +from avro import datafile + +class TestDataFileInterop(unittest.TestCase): + def test_interop(self): + print '' + print 'TEST INTEROP' + print '============' + print '' + for f in os.listdir('/home/cutting/src/avro/release-1.5.0-rc2/lang/py/../../build/interop/data'): + print 'READING %s' % f + print '' + + # read data in binary from file + reader = open(os.path.join('/home/cutting/src/avro/release-1.5.0-rc2/lang/py/../../build/interop/data', f), 'rb') + datum_reader = io.DatumReader() + dfr = datafile.DataFileReader(reader, datum_reader) + for datum in dfr: + assert datum is not None + +if __name__ == '__main__': + unittest.main() diff --git a/desktop/core/ext-py/avro-1.5.0/test/test_io.py b/desktop/core/ext-py/avro-1.5.0/test/test_io.py new file mode 100644 index 0000000..05a6f80 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/test/test_io.py @@ -0,0 +1,337 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
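+"""
+Unit tests for avro.io: datum validation, the zig-zag varint encoding of
+ints and longs, skip logic, and schema resolution (promotion, default
+values, projection, and field order).
+"""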
+import unittest +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +from binascii import hexlify +from avro import schema +from avro import io + +SCHEMAS_TO_VALIDATE = ( + ('"null"', None), + ('"boolean"', True), + ('"string"', unicode('adsfasdf09809dsf-=adsf')), + ('"bytes"', '12345abcd'), + ('"int"', 1234), + ('"long"', 1234), + ('"float"', 1234.0), + ('"double"', 1234.0), + ('{"type": "fixed", "name": "Test", "size": 1}', 'B'), + ('{"type": "enum", "name": "Test", "symbols": ["A", "B"]}', 'B'), + ('{"type": "array", "items": "long"}', [1, 3, 2]), + ('{"type": "map", "values": "long"}', {'a': 1, 'b': 3, 'c': 2}), + ('["string", "null", "long"]', None), + ("""\ + {"type": "record", + "name": "Test", + "fields": [{"name": "f", "type": "long"}]} + """, {'f': 5}), + ("""\ + {"type": "record", + "name": "Lisp", + "fields": [{"name": "value", + "type": ["null", "string", + {"type": "record", + "name": "Cons", + "fields": [{"name": "car", "type": "Lisp"}, + {"name": "cdr", "type": "Lisp"}]}]}]} + """, {'value': {'car': {'value': 'head'}, 'cdr': {'value': None}}}), +) + +BINARY_ENCODINGS = ( + (0, '00'), + (-1, '01'), + (1, '02'), + (-2, '03'), + (2, '04'), + (-64, '7f'), + (64, '80 01'), + (8192, '80 80 01'), + (-8193, '81 80 01'), +) + +DEFAULT_VALUE_EXAMPLES = ( + ('"null"', 'null', None), + ('"boolean"', 'true', True), + ('"string"', '"foo"', u'foo'), + ('"bytes"', '"\u00FF\u00FF"', u'\xff\xff'), + ('"int"', '5', 5), + ('"long"', '5', 5L), + ('"float"', '1.1', 1.1), + ('"double"', '1.1', 1.1), + ('{"type": "fixed", "name": "F", "size": 2}', '"\u00FF\u00FF"', u'\xff\xff'), + ('{"type": "enum", "name": "F", "symbols": ["FOO", "BAR"]}', '"FOO"', 'FOO'), + ('{"type": "array", "items": "int"}', '[1, 2, 3]', [1, 2, 3]), + ('{"type": "map", "values": "int"}', '{"a": 1, "b": 2}', {'a': 1, 'b': 2}), + ('["int", "null"]', '5', 5), + ('{"type": "record", "name": "F", "fields": [{"name": "A", "type": "int"}]}', + '{"A": 5}', {'A': 5}), +) + +LONG_RECORD_SCHEMA = schema.parse("""\ + {"type": "record", + "name": "Test", + "fields": [{"name": "A", "type": "int"}, + {"name": "B", "type": "int"}, + {"name": "C", "type": "int"}, + {"name": "D", "type": "int"}, + {"name": "E", "type": "int"}, + {"name": "F", "type": "int"}, + {"name": "G", "type": "int"}]}""") + +LONG_RECORD_DATUM = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7} + +def avro_hexlify(reader): + """Return the hex value, as a string, of a binary-encoded int or long.""" + bytes = [] + current_byte = reader.read(1) + bytes.append(hexlify(current_byte)) + while (ord(current_byte) & 0x80) != 0: + current_byte = reader.read(1) + bytes.append(hexlify(current_byte)) + return ' '.join(bytes) + +def print_test_name(test_name): + print '' + print test_name + print '=' * len(test_name) + print '' + +def write_datum(datum, writers_schema): + writer = StringIO() + encoder = io.BinaryEncoder(writer) + datum_writer = io.DatumWriter(writers_schema) + datum_writer.write(datum, encoder) + return writer, encoder, datum_writer + +def read_datum(buffer, writers_schema, readers_schema=None): + reader = StringIO(buffer.getvalue()) + decoder = io.BinaryDecoder(reader) + datum_reader = io.DatumReader(writers_schema, readers_schema) + return datum_reader.read(decoder) + +def check_binary_encoding(number_type): + print_test_name('TEST BINARY %s ENCODING' % number_type.upper()) + correct = 0 + for datum, hex_encoding in BINARY_ENCODINGS: + print 'Datum: %d' % datum + print 'Correct Encoding: %s' % hex_encoding + + 
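# encode the datum using this numeric schema, then hexlify the raw
+      # zig-zag varint bytes for comparison against the expected encoding
+      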
writers_schema = schema.parse('"%s"' % number_type.lower()) + writer, encoder, datum_writer = write_datum(datum, writers_schema) + writer.seek(0) + hex_val = avro_hexlify(writer) + + print 'Read Encoding: %s' % hex_val + if hex_encoding == hex_val: correct += 1 + print '' + return correct + +def check_skip_number(number_type): + print_test_name('TEST SKIP %s' % number_type.upper()) + correct = 0 + for value_to_skip, hex_encoding in BINARY_ENCODINGS: + VALUE_TO_READ = 6253 + print 'Value to Skip: %d' % value_to_skip + + # write the value to skip and a known value + writers_schema = schema.parse('"%s"' % number_type.lower()) + writer, encoder, datum_writer = write_datum(value_to_skip, writers_schema) + datum_writer.write(VALUE_TO_READ, encoder) + + # skip the value + reader = StringIO(writer.getvalue()) + decoder = io.BinaryDecoder(reader) + decoder.skip_long() + + # read data from string buffer + datum_reader = io.DatumReader(writers_schema) + read_value = datum_reader.read(decoder) + + print 'Read Value: %d' % read_value + if read_value == VALUE_TO_READ: correct += 1 + print '' + return correct + +class TestIO(unittest.TestCase): + # + # BASIC FUNCTIONALITY + # + + def test_validate(self): + print_test_name('TEST VALIDATE') + passed = 0 + for example_schema, datum in SCHEMAS_TO_VALIDATE: + print 'Schema: %s' % example_schema + print 'Datum: %s' % datum + validated = io.validate(schema.parse(example_schema), datum) + print 'Valid: %s' % validated + if validated: passed += 1 + self.assertEquals(passed, len(SCHEMAS_TO_VALIDATE)) + + def test_round_trip(self): + print_test_name('TEST ROUND TRIP') + correct = 0 + for example_schema, datum in SCHEMAS_TO_VALIDATE: + print 'Schema: %s' % example_schema + print 'Datum: %s' % datum + + writers_schema = schema.parse(example_schema) + writer, encoder, datum_writer = write_datum(datum, writers_schema) + round_trip_datum = read_datum(writer, writers_schema) + + print 'Round Trip Datum: %s' % round_trip_datum + if datum == round_trip_datum: correct += 1 + self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE)) + + # + # BINARY ENCODING OF INT AND LONG + # + + def test_binary_int_encoding(self): + correct = check_binary_encoding('int') + self.assertEquals(correct, len(BINARY_ENCODINGS)) + + def test_binary_long_encoding(self): + correct = check_binary_encoding('long') + self.assertEquals(correct, len(BINARY_ENCODINGS)) + + def test_skip_int(self): + correct = check_skip_number('int') + self.assertEquals(correct, len(BINARY_ENCODINGS)) + + def test_skip_long(self): + correct = check_skip_number('long') + self.assertEquals(correct, len(BINARY_ENCODINGS)) + + # + # SCHEMA RESOLUTION + # + + def test_schema_promotion(self): + print_test_name('TEST SCHEMA PROMOTION') + # note that checking writers_schema.type in read_data + # allows us to handle promotion correctly + promotable_schemas = ['"int"', '"long"', '"float"', '"double"'] + incorrect = 0 + for i, ws in enumerate(promotable_schemas): + writers_schema = schema.parse(ws) + datum_to_write = 219 + for rs in promotable_schemas[i + 1:]: + readers_schema = schema.parse(rs) + writer, enc, dw = write_datum(datum_to_write, writers_schema) + datum_read = read_datum(writer, writers_schema, readers_schema) + print 'Writer: %s Reader: %s' % (writers_schema, readers_schema) + print 'Datum Read: %s' % datum_read + if datum_read != datum_to_write: incorrect += 1 + self.assertEquals(incorrect, 0) + + def test_unknown_symbol(self): + print_test_name('TEST UNKNOWN SYMBOL') + writers_schema = schema.parse("""\ + {"type": 
"enum", "name": "Test", + "symbols": ["FOO", "BAR"]}""") + datum_to_write = 'FOO' + + readers_schema = schema.parse("""\ + {"type": "enum", "name": "Test", + "symbols": ["BAR", "BAZ"]}""") + + writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema) + reader = StringIO(writer.getvalue()) + decoder = io.BinaryDecoder(reader) + datum_reader = io.DatumReader(writers_schema, readers_schema) + self.assertRaises(io.SchemaResolutionException, datum_reader.read, decoder) + + def test_default_value(self): + print_test_name('TEST DEFAULT VALUE') + writers_schema = LONG_RECORD_SCHEMA + datum_to_write = LONG_RECORD_DATUM + + correct = 0 + for field_type, default_json, default_datum in DEFAULT_VALUE_EXAMPLES: + readers_schema = schema.parse("""\ + {"type": "record", "name": "Test", + "fields": [{"name": "H", "type": %s, "default": %s}]} + """ % (field_type, default_json)) + datum_to_read = {'H': default_datum} + + writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema) + datum_read = read_datum(writer, writers_schema, readers_schema) + print 'Datum Read: %s' % datum_read + if datum_to_read == datum_read: correct += 1 + self.assertEquals(correct, len(DEFAULT_VALUE_EXAMPLES)) + + def test_no_default_value(self): + print_test_name('TEST NO DEFAULT VALUE') + writers_schema = LONG_RECORD_SCHEMA + datum_to_write = LONG_RECORD_DATUM + + readers_schema = schema.parse("""\ + {"type": "record", "name": "Test", + "fields": [{"name": "H", "type": "int"}]}""") + + writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema) + reader = StringIO(writer.getvalue()) + decoder = io.BinaryDecoder(reader) + datum_reader = io.DatumReader(writers_schema, readers_schema) + self.assertRaises(io.SchemaResolutionException, datum_reader.read, decoder) + + def test_projection(self): + print_test_name('TEST PROJECTION') + writers_schema = LONG_RECORD_SCHEMA + datum_to_write = LONG_RECORD_DATUM + + readers_schema = schema.parse("""\ + {"type": "record", "name": "Test", + "fields": [{"name": "E", "type": "int"}, + {"name": "F", "type": "int"}]}""") + datum_to_read = {'E': 5, 'F': 6} + + writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema) + datum_read = read_datum(writer, writers_schema, readers_schema) + print 'Datum Read: %s' % datum_read + self.assertEquals(datum_to_read, datum_read) + + def test_field_order(self): + print_test_name('TEST FIELD ORDER') + writers_schema = LONG_RECORD_SCHEMA + datum_to_write = LONG_RECORD_DATUM + + readers_schema = schema.parse("""\ + {"type": "record", "name": "Test", + "fields": [{"name": "F", "type": "int"}, + {"name": "E", "type": "int"}]}""") + datum_to_read = {'E': 5, 'F': 6} + + writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema) + datum_read = read_datum(writer, writers_schema, readers_schema) + print 'Datum Read: %s' % datum_read + self.assertEquals(datum_to_read, datum_read) + + def test_type_exception(self): + print_test_name('TEST TYPE EXCEPTION') + writers_schema = schema.parse("""\ + {"type": "record", "name": "Test", + "fields": [{"name": "F", "type": "int"}, + {"name": "E", "type": "int"}]}""") + datum_to_write = {'E': 5, 'F': 'Bad'} + self.assertRaises(io.AvroTypeException, write_datum, datum_to_write, writers_schema) + +if __name__ == '__main__': + unittest.main() diff --git a/desktop/core/ext-py/avro-1.5.0/test/test_ipc.py b/desktop/core/ext-py/avro-1.5.0/test/test_ipc.py new file mode 100644 index 0000000..1d1b733 --- /dev/null +++ 
b/desktop/core/ext-py/avro-1.5.0/test/test_ipc.py
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+There are currently no IPC tests within Python, in part because there are no
+servers yet available.
+"""
+import unittest

+# This test imports avro.ipc simply to make sure it at least passes
+# compilation.
+import avro.ipc
+
+class TestIPC(unittest.TestCase):
+  def test_placeholder(self):
+    pass
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/desktop/core/ext-py/avro-1.5.0/test/test_protocol.py b/desktop/core/ext-py/avro-1.5.0/test/test_protocol.py
new file mode 100644
index 0000000..06bda40
--- /dev/null
+++ b/desktop/core/ext-py/avro-1.5.0/test/test_protocol.py
@@ -0,0 +1,422 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test the protocol parsing logic.
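+Each example protocol below is fed to protocol.parse(); examples marked
+valid must parse cleanly, and the invalid ones must raise an error.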
+""" +import unittest +from avro import protocol + +class ExampleProtocol(object): + def __init__(self, protocol_string, valid, name='', comment=''): + self._protocol_string = protocol_string + self._valid = valid + self._name = name or protocol_string # default to schema_string for name + self._comment = comment + + # read-only properties + protocol_string = property(lambda self: self._protocol_string) + valid = property(lambda self: self._valid) + name = property(lambda self: self._name) + + # read/write properties + def set_comment(self, new_comment): self._comment = new_comment + comment = property(lambda self: self._comment, set_comment) + +# +# Example Protocols +# + +EXAMPLES = [ + ExampleProtocol("""\ +{ + "namespace": "com.acme", + "protocol": "HelloWorld", + + "types": [ + {"name": "Greeting", "type": "record", "fields": [ + {"name": "message", "type": "string"}]}, + {"name": "Curse", "type": "error", "fields": [ + {"name": "message", "type": "string"}]} + ], + + "messages": { + "hello": { + "request": [{"name": "greeting", "type": "Greeting" }], + "response": "Greeting", + "errors": ["Curse"] + } + } +} + """, True), + ExampleProtocol("""\ +{"namespace": "org.apache.avro.test", + "protocol": "Simple", + + "types": [ + {"name": "Kind", "type": "enum", "symbols": ["FOO","BAR","BAZ"]}, + + {"name": "MD5", "type": "fixed", "size": 16}, + + {"name": "TestRecord", "type": "record", + "fields": [ + {"name": "name", "type": "string", "order": "ignore"}, + {"name": "kind", "type": "Kind", "order": "descending"}, + {"name": "hash", "type": "MD5"} + ] + }, + + {"name": "TestError", "type": "error", "fields": [ + {"name": "message", "type": "string"} + ] + } + + ], + + "messages": { + + "hello": { + "request": [{"name": "greeting", "type": "string"}], + "response": "string" + }, + + "echo": { + "request": [{"name": "record", "type": "TestRecord"}], + "response": "TestRecord" + }, + + "add": { + "request": [{"name": "arg1", "type": "int"}, {"name": "arg2", "type": "int"}], + "response": "int" + }, + + "echoBytes": { + "request": [{"name": "data", "type": "bytes"}], + "response": "bytes" + }, + + "error": { + "request": [], + "response": "null", + "errors": ["TestError"] + } + } + +} + """, True), + ExampleProtocol("""\ +{"namespace": "org.apache.avro.test.namespace", + "protocol": "TestNamespace", + + "types": [ + {"name": "org.apache.avro.test.util.MD5", "type": "fixed", "size": 16}, + {"name": "TestRecord", "type": "record", + "fields": [ {"name": "hash", "type": "org.apache.avro.test.util.MD5"} ] + }, + {"name": "TestError", "namespace": "org.apache.avro.test.errors", + "type": "error", "fields": [ {"name": "message", "type": "string"} ] + } + ], + + "messages": { + "echo": { + "request": [{"name": "record", "type": "TestRecord"}], + "response": "TestRecord" + }, + + "error": { + "request": [], + "response": "null", + "errors": ["org.apache.avro.test.errors.TestError"] + } + + } + +} + """, True), +ExampleProtocol("""\ +{"namespace": "org.apache.avro.test.namespace", + "protocol": "TestImplicitNamespace", + + "types": [ + {"name": "org.apache.avro.test.util.MD5", "type": "fixed", "size": 16}, + {"name": "ReferencedRecord", "type": "record", + "fields": [ {"name": "foo", "type": "string"} ] }, + {"name": "TestRecord", "type": "record", + "fields": [ {"name": "hash", "type": "org.apache.avro.test.util.MD5"}, + {"name": "unqalified", "type": "ReferencedRecord"} ] + }, + {"name": "TestError", + "type": "error", "fields": [ {"name": "message", "type": "string"} ] + } + ], + + "messages": { + 
"echo": { + "request": [{"name": "qualified", + "type": "org.apache.avro.test.namespace.TestRecord"}], + "response": "TestRecord" + }, + + "error": { + "request": [], + "response": "null", + "errors": ["org.apache.avro.test.namespace.TestError"] + } + + } + +} + """, True), +ExampleProtocol("""\ +{"namespace": "org.apache.avro.test.namespace", + "protocol": "TestNamespaceTwo", + + "types": [ + {"name": "org.apache.avro.test.util.MD5", "type": "fixed", "size": 16}, + {"name": "ReferencedRecord", "type": "record", + "namespace": "org.apache.avro.other.namespace", + "fields": [ {"name": "foo", "type": "string"} ] }, + {"name": "TestRecord", "type": "record", + "fields": [ {"name": "hash", "type": "org.apache.avro.test.util.MD5"}, + {"name": "qualified", + "type": "org.apache.avro.other.namespace.ReferencedRecord"} + ] + }, + {"name": "TestError", + "type": "error", "fields": [ {"name": "message", "type": "string"} ] + } + ], + + "messages": { + "echo": { + "request": [{"name": "qualified", + "type": "org.apache.avro.test.namespace.TestRecord"}], + "response": "TestRecord" + }, + + "error": { + "request": [], + "response": "null", + "errors": ["org.apache.avro.test.namespace.TestError"] + } + + } + +} + """, True), +ExampleProtocol("""\ +{"namespace": "org.apache.avro.test.namespace", + "protocol": "TestValidRepeatedName", + + "types": [ + {"name": "org.apache.avro.test.util.MD5", "type": "fixed", "size": 16}, + {"name": "ReferencedRecord", "type": "record", + "namespace": "org.apache.avro.other.namespace", + "fields": [ {"name": "foo", "type": "string"} ] }, + {"name": "ReferencedRecord", "type": "record", + "fields": [ {"name": "bar", "type": "double"} ] }, + {"name": "TestError", + "type": "error", "fields": [ {"name": "message", "type": "string"} ] + } + ], + + "messages": { + "echo": { + "request": [{"name": "qualified", + "type": "ReferencedRecord"}], + "response": "org.apache.avro.other.namespace.ReferencedRecord" + }, + + "error": { + "request": [], + "response": "null", + "errors": ["org.apache.avro.test.namespace.TestError"] + } + + } + +} + """, True), +ExampleProtocol("""\ +{"namespace": "org.apache.avro.test.namespace", + "protocol": "TestInvalidRepeatedName", + + "types": [ + {"name": "org.apache.avro.test.util.MD5", "type": "fixed", "size": 16}, + {"name": "ReferencedRecord", "type": "record", + "fields": [ {"name": "foo", "type": "string"} ] }, + {"name": "ReferencedRecord", "type": "record", + "fields": [ {"name": "bar", "type": "double"} ] }, + {"name": "TestError", + "type": "error", "fields": [ {"name": "message", "type": "string"} ] + } + ], + + "messages": { + "echo": { + "request": [{"name": "qualified", + "type": "ReferencedRecord"}], + "response": "org.apache.avro.other.namespace.ReferencedRecord" + }, + + "error": { + "request": [], + "response": "null", + "errors": ["org.apache.avro.test.namespace.TestError"] + } + + } + +} + """, False), + ExampleProtocol("""\ +{"namespace": "org.apache.avro.test", + "protocol": "BulkData", + + "types": [], + + "messages": { + + "read": { + "request": [], + "response": "bytes" + }, + + "write": { + "request": [ {"name": "data", "type": "bytes"} ], + "response": "null" + } + + } + +} + """, True), + ExampleProtocol("""\ +{ + "protocol" : "API", + "namespace" : "xyz.api", + "types" : [ { + "type" : "enum", + "name" : "Symbology", + "namespace" : "xyz.api.product", + "symbols" : [ "OPRA", "CUSIP", "ISIN", "SEDOL" ] + }, { + "type" : "record", + "name" : "Symbol", + "namespace" : "xyz.api.product", + "fields" : [ { + "name" : 
"symbology", + "type" : "xyz.api.product.Symbology" + }, { + "name" : "symbol", + "type" : "string" + } ] + }, { + "type" : "record", + "name" : "MultiSymbol", + "namespace" : "xyz.api.product", + "fields" : [ { + "name" : "symbols", + "type" : { + "type" : "map", + "values" : "xyz.api.product.Symbol" + } + } ] + } ], + "messages" : { + } +} + """, True), +] + +VALID_EXAMPLES = [e for e in EXAMPLES if e.valid] + +class TestProtocol(unittest.TestCase): + def test_parse(self): + num_correct = 0 + for example in EXAMPLES: + try: + protocol.parse(example.protocol_string) + if example.valid: + num_correct += 1 + else: + self.fail("Parsed invalid protocol: %s" % (example.name,)) + except Exception, e: + if not example.valid: + num_correct += 1 + else: + self.fail("Coudl not parse valid protocol: %s" % (example.name,)) + + fail_msg = "Parse behavior correct on %d out of %d protocols." % \ + (num_correct, len(EXAMPLES)) + self.assertEqual(num_correct, len(EXAMPLES), fail_msg) + + def test_valid_cast_to_string_after_parse(self): + """ + Test that the string generated by an Avro Protocol object + is, in fact, a valid Avro protocol. + """ + print '' + print 'TEST CAST TO STRING' + print '===================' + print '' + + num_correct = 0 + for example in VALID_EXAMPLES: + protocol_data = protocol.parse(example.protocol_string) + try: + try: + protocol.parse(str(protocol_data)) + debug_msg = "%s: STRING CAST SUCCESS" % example.name + num_correct += 1 + except: + debug_msg = "%s: STRING CAST FAILURE" % example.name + finally: + print debug_msg + + fail_msg = "Cast to string success on %d out of %d protocols" % \ + (num_correct, len(VALID_EXAMPLES)) + self.assertEqual(num_correct, len(VALID_EXAMPLES), fail_msg) + + def test_equivalence_after_round_trip(self): + """ + 1. Given a string, parse it to get Avro protocol "original". + 2. Serialize "original" to a string and parse that string + to generate Avro protocol "round trip". + 3. Ensure "original" and "round trip" protocols are equivalent. + """ + print '' + print 'TEST ROUND TRIP' + print '===============' + print '' + + num_correct = 0 + for example in VALID_EXAMPLES: + original_protocol = protocol.parse(example.protocol_string) + round_trip_protocol = protocol.parse(str(original_protocol)) + + if original_protocol == round_trip_protocol: + num_correct += 1 + debug_msg = "%s: ROUND TRIP SUCCESS" % example.name + else: + self.fail("Round trip failure: %s %s %s", (example.name, example.protocol_string, str(original_protocol))) + + fail_msg = "Round trip success on %d out of %d protocols" % \ + (num_correct, len(VALID_EXAMPLES)) + self.assertEqual(num_correct, len(VALID_EXAMPLES), fail_msg) + +if __name__ == '__main__': + unittest.main() diff --git a/desktop/core/ext-py/avro-1.5.0/test/test_schema.py b/desktop/core/ext-py/avro-1.5.0/test/test_schema.py new file mode 100644 index 0000000..4700144 --- /dev/null +++ b/desktop/core/ext-py/avro-1.5.0/test/test_schema.py @@ -0,0 +1,394 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test the schema parsing logic. +""" +import unittest +from avro import schema + +def print_test_name(test_name): + print '' + print test_name + print '=' * len(test_name) + print '' + +class ExampleSchema(object): + def __init__(self, schema_string, valid, name='', comment=''): + self._schema_string = schema_string + self._valid = valid + self._name = name or schema_string # default to schema_string for name + self.comment = comment + + @property + def schema_string(self): + return self._schema_string + + @property + def valid(self): + return self._valid + + @property + def name(self): + return self._name + +# +# Example Schemas +# + +def make_primitive_examples(): + examples = [] + for type in schema.PRIMITIVE_TYPES: + examples.append(ExampleSchema('"%s"' % type, True)) + examples.append(ExampleSchema('{"type": "%s"}' % type, True)) + return examples + +PRIMITIVE_EXAMPLES = [ + ExampleSchema('"True"', False), + ExampleSchema('True', False), + ExampleSchema('{"no_type": "test"}', False), + ExampleSchema('{"type": "panther"}', False), +] + make_primitive_examples() + +FIXED_EXAMPLES = [ + ExampleSchema('{"type": "fixed", "name": "Test", "size": 1}', True), + ExampleSchema("""\ + {"type": "fixed", + "name": "MyFixed", + "namespace": "org.apache.hadoop.avro", + "size": 1} + """, True), + ExampleSchema("""\ + {"type": "fixed", + "name": "Missing size"} + """, False), + ExampleSchema("""\ + {"type": "fixed", + "size": 314} + """, False), +] + +ENUM_EXAMPLES = [ + ExampleSchema('{"type": "enum", "name": "Test", "symbols": ["A", "B"]}', True), + ExampleSchema("""\ + {"type": "enum", + "name": "Status", + "symbols": "Normal Caution Critical"} + """, False), + ExampleSchema("""\ + {"type": "enum", + "name": [ 0, 1, 1, 2, 3, 5, 8 ], + "symbols": ["Golden", "Mean"]} + """, False), + ExampleSchema("""\ + {"type": "enum", + "symbols" : ["I", "will", "fail", "no", "name"]} + """, False), + ExampleSchema("""\ + {"type": "enum", + "name": "Test" + "symbols" : ["AA", "AA"]} + """, False), +] + +ARRAY_EXAMPLES = [ + ExampleSchema('{"type": "array", "items": "long"}', True), + ExampleSchema("""\ + {"type": "array", + "items": {"type": "enum", "name": "Test", "symbols": ["A", "B"]}} + """, True), +] + +MAP_EXAMPLES = [ + ExampleSchema('{"type": "map", "values": "long"}', True), + ExampleSchema("""\ + {"type": "map", + "values": {"type": "enum", "name": "Test", "symbols": ["A", "B"]}} + """, True), +] + +UNION_EXAMPLES = [ + ExampleSchema('["string", "null", "long"]', True), + ExampleSchema('["null", "null"]', False), + ExampleSchema('["long", "long"]', False), + ExampleSchema("""\ + [{"type": "array", "items": "long"} + {"type": "array", "items": "string"}] + """, False), +] + +RECORD_EXAMPLES = [ + ExampleSchema("""\ + {"type": "record", + "name": "Test", + "fields": [{"name": "f", + "type": "long"}]} + """, True), + ExampleSchema("""\ + {"type": "error", + "name": "Test", + "fields": [{"name": "f", + "type": "long"}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "Node", + "fields": [{"name": "label", "type": "string"}, + {"name": "children", + "type": {"type": "array", 
"items": "Node"}}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "Lisp", + "fields": [{"name": "value", + "type": ["null", "string", + {"type": "record", + "name": "Cons", + "fields": [{"name": "car", "type": "Lisp"}, + {"name": "cdr", "type": "Lisp"}]}]}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "HandshakeRequest", + "namespace": "org.apache.avro.ipc", + "fields": [{"name": "clientHash", + "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "clientProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": "MD5"}, + {"name": "meta", + "type": ["null", {"type": "map", "values": "bytes"}]}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "HandshakeResponse", + "namespace": "org.apache.avro.ipc", + "fields": [{"name": "match", + "type": {"type": "enum", + "name": "HandshakeMatch", + "symbols": ["BOTH", "CLIENT", "NONE"]}}, + {"name": "serverProtocol", "type": ["null", "string"]}, + {"name": "serverHash", + "type": ["null", + {"name": "MD5", "size": 16, "type": "fixed"}]}, + {"name": "meta", + "type": ["null", {"type": "map", "values": "bytes"}]}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "Interop", + "namespace": "org.apache.avro", + "fields": [{"name": "intField", "type": "int"}, + {"name": "longField", "type": "long"}, + {"name": "stringField", "type": "string"}, + {"name": "boolField", "type": "boolean"}, + {"name": "floatField", "type": "float"}, + {"name": "doubleField", "type": "double"}, + {"name": "bytesField", "type": "bytes"}, + {"name": "nullField", "type": "null"}, + {"name": "arrayField", + "type": {"type": "array", "items": "double"}}, + {"name": "mapField", + "type": {"type": "map", + "values": {"name": "Foo", + "type": "record", + "fields": [{"name": "label", + "type": "string"}]}}}, + {"name": "unionField", + "type": ["boolean", + "double", + {"type": "array", "items": "bytes"}]}, + {"name": "enumField", + "type": {"type": "enum", + "name": "Kind", + "symbols": ["A", "B", "C"]}}, + {"name": "fixedField", + "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "recordField", + "type": {"type": "record", + "name": "Node", + "fields": [{"name": "label", "type": "string"}, + {"name": "children", + "type": {"type": "array", + "items": "Node"}}]}}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "ipAddr", + "fields": [{"name": "addr", + "type": [{"name": "IPv6", "type": "fixed", "size": 16}, + {"name": "IPv4", "type": "fixed", "size": 4}]}]} + """, True), + ExampleSchema("""\ + {"type": "record", + "name": "Address", + "fields": [{"type": "string"}, + {"type": "string", "name": "City"}]} + """, False), + ExampleSchema("""\ + {"type": "record", + "name": "Event", + "fields": [{"name": "Sponsor"}, + {"name": "City", "type": "string"}]} + """, False), + ExampleSchema("""\ + {"type": "record", + "fields": "His vision, from the constantly passing bars," + "name", "Rainer"} + """, False), + ExampleSchema("""\ + {"name": ["Tom", "Jerry"], + "type": "record", + "fields": [{"name": "name", "type": "string"}]} + """, False), +] + +EXAMPLES = PRIMITIVE_EXAMPLES +EXAMPLES += FIXED_EXAMPLES +EXAMPLES += ENUM_EXAMPLES +EXAMPLES += ARRAY_EXAMPLES +EXAMPLES += MAP_EXAMPLES +EXAMPLES += UNION_EXAMPLES +EXAMPLES += RECORD_EXAMPLES + +VALID_EXAMPLES = [e for e in EXAMPLES if e.valid] + +# TODO(hammer): refactor into harness for examples +# TODO(hammer): pretty-print detailed output +# TODO(hammer): make verbose flag +# TODO(hammer): show strack trace to 
user
+# TODO(hammer): use logging module?
+class TestSchema(unittest.TestCase):
+
+  def test_correct_recursive_extraction(self):
+    s = schema.parse('{"type": "record", "name": "X", "fields": [{"name": "y", "type": {"type": "record", "name": "Y", "fields": [{"name": "Z", "type": "X"}]}}]}')
+    t = schema.parse(str(s.fields[0].type))
+    # If we've made it this far, the subschema was reasonably stringified; it could be reparsed.
+    self.assertEqual("X", t.fields[0].type.name)
+
+  def test_parse(self):
+    correct = 0
+    for example in EXAMPLES:
+      try:
+        schema.parse(example.schema_string)
+        if example.valid:
+          correct += 1
+        else:
+          self.fail("Invalid schema was parsed: " + example.schema_string)
+      except:
+        if not example.valid:
+          correct += 1
+        else:
+          self.fail("Valid schema failed to parse: " + example.schema_string)
+
+    fail_msg = "Parse behavior correct on %d out of %d schemas." % \
+      (correct, len(EXAMPLES))
+    self.assertEqual(correct, len(EXAMPLES), fail_msg)
+
+  def test_valid_cast_to_string_after_parse(self):
+    """
+    Test that the string generated by an Avro Schema object
+    is, in fact, a valid Avro schema.
+    """
+    print_test_name('TEST CAST TO STRING AFTER PARSE')
+    correct = 0
+    for example in VALID_EXAMPLES:
+      schema_data = schema.parse(example.schema_string)
+      schema.parse(str(schema_data))
+      correct += 1
+
+    fail_msg = "Cast to string success on %d out of %d schemas" % \
+      (correct, len(VALID_EXAMPLES))
+    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
+
+  def test_equivalence_after_round_trip(self):
+    """
+    1. Given a string, parse it to get Avro schema "original".
+    2. Serialize "original" to a string and parse that string
+       to generate Avro schema "round trip".
+    3. Ensure "original" and "round trip" schemas are equivalent.
+    """
+    print_test_name('TEST ROUND TRIP')
+    correct = 0
+    for example in VALID_EXAMPLES:
+      original_schema = schema.parse(example.schema_string)
+      round_trip_schema = schema.parse(str(original_schema))
+      if original_schema == round_trip_schema:
+        correct += 1
+        debug_msg = "%s: ROUND TRIP SUCCESS" % example.name
+      else:
+        debug_msg = "%s: ROUND TRIP FAILURE" % example.name
+        self.fail("Round trip failure: %s, %s, %s" % (example.name, original_schema, round_trip_schema))
+
+    fail_msg = "Round trip success on %d out of %d schemas" % \
+      (correct, len(VALID_EXAMPLES))
+    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
+
+  # TODO(hammer): more tests
+  def test_fullname(self):
+    """
+    The fullname is determined in one of the following ways:
+     * A name and namespace are both specified. For example,
+       one might use "name": "X", "namespace": "org.foo"
+       to indicate the fullname "org.foo.X".
+     * A fullname is specified. If the name specified contains
+       a dot, then it is assumed to be a fullname, and any
+       namespace also specified is ignored. For example,
+       use "name": "org.foo.X" to indicate the
+       fullname "org.foo.X".
+     * A name only is specified, i.e., a name that contains no
+       dots. In this case the namespace is taken from the most
+       tightly enclosing schema or protocol. For example,
+       if "name": "X" is specified, and this occurs
+       within a field of the record definition
+       of "org.foo.Y", then the fullname is "org.foo.X".
+
+    References to previously defined names are as in the latter
+    two cases above: if they contain a dot they are a fullname, if
+    they do not contain a dot, the namespace is the namespace of
+    the enclosing definition.
+
+    Primitive type names have no namespace and their names may
+    not be defined in any namespace.
A schema may only contain + multiple definitions of a fullname if the definitions are + equivalent. + """ + print_test_name('TEST FULLNAME') + + # name and namespace specified + fullname = schema.Name('a', 'o.a.h', None).fullname + self.assertEqual(fullname, 'o.a.h.a') + + # fullname and namespace specified + fullname = schema.Name('a.b.c.d', 'o.a.h', None).fullname + self.assertEqual(fullname, 'a.b.c.d') + + # name and default namespace specified + fullname = schema.Name('a', None, 'b.c.d').fullname + self.assertEqual(fullname, 'b.c.d.a') + + # fullname and default namespace specified + fullname = schema.Name('a.b.c.d', None, 'o.a.h').fullname + self.assertEqual(fullname, 'a.b.c.d') + + # fullname, namespace, default namespace specified + fullname = schema.Name('a.b.c.d', 'o.a.a', 'o.a.h').fullname + self.assertEqual(fullname, 'a.b.c.d') + + # name, namespace, default namespace specified + fullname = schema.Name('a', 'o.a.a', 'o.a.h').fullname + self.assertEqual(fullname, 'o.a.a.a') + +if __name__ == '__main__': + unittest.main() -- 1.7.4.4