HiveToCatalogConverter.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Copied from
// https://github.com/awslabs/aws-glue-data-catalog-client-for-apache-hive-metastore/blob/branch-3.4.0/
//
package com.amazonaws.glue.catalog.converters;
import com.amazonaws.services.glue.model.BinaryColumnStatisticsData;
import com.amazonaws.services.glue.model.BooleanColumnStatisticsData;
import com.amazonaws.services.glue.model.ColumnStatisticsType;
import com.amazonaws.services.glue.model.DateColumnStatisticsData;
import com.amazonaws.services.glue.model.DecimalColumnStatisticsData;
import com.amazonaws.services.glue.model.DoubleColumnStatisticsData;
import com.amazonaws.services.glue.model.LongColumnStatisticsData;
import com.amazonaws.services.glue.model.StringColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Function;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.ResourceUri;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.SkewedInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Table;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
public class HiveToCatalogConverter {
public static com.amazonaws.services.glue.model.Database convertDatabase(Database hiveDatabase) {
com.amazonaws.services.glue.model.Database catalogDatabase = new com.amazonaws.services.glue.model.Database();
catalogDatabase.setName(hiveDatabase.getName());
catalogDatabase.setDescription(hiveDatabase.getDescription());
catalogDatabase.setLocationUri(hiveDatabase.getLocationUri());
catalogDatabase.setParameters(hiveDatabase.getParameters());
return catalogDatabase;
}
public static com.amazonaws.services.glue.model.Table convertTable(
Table hiveTable) {
com.amazonaws.services.glue.model.Table catalogTable = new com.amazonaws.services.glue.model.Table();
catalogTable.setRetention(hiveTable.getRetention());
catalogTable.setPartitionKeys(convertFieldSchemaList(hiveTable.getPartitionKeys()));
catalogTable.setTableType(hiveTable.getTableType());
catalogTable.setName(hiveTable.getTableName());
catalogTable.setOwner(hiveTable.getOwner());
catalogTable.setCreateTime(new Date((long) hiveTable.getCreateTime() * 1000));
catalogTable.setLastAccessTime(new Date((long) hiveTable.getLastAccessTime() * 1000));
catalogTable.setStorageDescriptor(convertStorageDescriptor(hiveTable.getSd()));
catalogTable.setParameters(hiveTable.getParameters());
catalogTable.setViewExpandedText(hiveTable.getViewExpandedText());
catalogTable.setViewOriginalText(hiveTable.getViewOriginalText());
return catalogTable;
}
public static com.amazonaws.services.glue.model.StorageDescriptor convertStorageDescriptor(
StorageDescriptor hiveSd) {
com.amazonaws.services.glue.model.StorageDescriptor catalogSd =
new com.amazonaws.services.glue.model.StorageDescriptor();
catalogSd.setNumberOfBuckets(hiveSd.getNumBuckets());
catalogSd.setCompressed(hiveSd.isCompressed());
catalogSd.setParameters(hiveSd.getParameters());
catalogSd.setBucketColumns(hiveSd.getBucketCols());
catalogSd.setColumns(convertFieldSchemaList(hiveSd.getCols()));
catalogSd.setInputFormat(hiveSd.getInputFormat());
catalogSd.setLocation(hiveSd.getLocation());
catalogSd.setOutputFormat(hiveSd.getOutputFormat());
catalogSd.setSerdeInfo(convertSerDeInfo(hiveSd.getSerdeInfo()));
catalogSd.setSkewedInfo(convertSkewedInfo(hiveSd.getSkewedInfo()));
catalogSd.setSortColumns(convertOrderList(hiveSd.getSortCols()));
catalogSd.setStoredAsSubDirectories(hiveSd.isStoredAsSubDirectories());
return catalogSd;
}
public static com.amazonaws.services.glue.model.Column convertFieldSchema(
FieldSchema hiveFieldSchema) {
com.amazonaws.services.glue.model.Column catalogFieldSchema =
new com.amazonaws.services.glue.model.Column();
catalogFieldSchema.setComment(hiveFieldSchema.getComment());
catalogFieldSchema.setName(hiveFieldSchema.getName());
catalogFieldSchema.setType(hiveFieldSchema.getType());
return catalogFieldSchema;
}
public static List<com.amazonaws.services.glue.model.Column> convertFieldSchemaList(
List<FieldSchema> hiveFieldSchemaList) {
List<com.amazonaws.services.glue.model.Column> catalogFieldSchemaList =
new ArrayList<com.amazonaws.services.glue.model.Column>();
for (FieldSchema hiveFs : hiveFieldSchemaList){
catalogFieldSchemaList.add(convertFieldSchema(hiveFs));
}
return catalogFieldSchemaList;
}
public static com.amazonaws.services.glue.model.SerDeInfo convertSerDeInfo(
SerDeInfo hiveSerDeInfo) {
com.amazonaws.services.glue.model.SerDeInfo catalogSerDeInfo = new com.amazonaws.services.glue.model.SerDeInfo();
catalogSerDeInfo.setName(hiveSerDeInfo.getName());
catalogSerDeInfo.setParameters(hiveSerDeInfo.getParameters());
catalogSerDeInfo.setSerializationLibrary(hiveSerDeInfo.getSerializationLib());
return catalogSerDeInfo;
}
public static com.amazonaws.services.glue.model.SkewedInfo convertSkewedInfo(SkewedInfo hiveSkewedInfo) {
if (hiveSkewedInfo == null)
return null;
com.amazonaws.services.glue.model.SkewedInfo catalogSkewedInfo = new com.amazonaws.services.glue.model.SkewedInfo()
.withSkewedColumnNames(hiveSkewedInfo.getSkewedColNames())
.withSkewedColumnValues(convertSkewedValue(hiveSkewedInfo.getSkewedColValues()))
.withSkewedColumnValueLocationMaps(convertSkewedMap(hiveSkewedInfo.getSkewedColValueLocationMaps()));
return catalogSkewedInfo;
}
public static com.amazonaws.services.glue.model.Order convertOrder(Order hiveOrder) {
com.amazonaws.services.glue.model.Order order = new com.amazonaws.services.glue.model.Order();
order.setColumn(hiveOrder.getCol());
order.setSortOrder(hiveOrder.getOrder());
return order;
}
public static List<com.amazonaws.services.glue.model.Order> convertOrderList(List<Order> hiveOrderList) {
if (hiveOrderList == null) {
return null;
}
List<com.amazonaws.services.glue.model.Order> catalogOrderList = new ArrayList<>();
for (Order hiveOrder : hiveOrderList) {
catalogOrderList.add(convertOrder(hiveOrder));
}
return catalogOrderList;
}
public static com.amazonaws.services.glue.model.Partition convertPartition(Partition src) {
com.amazonaws.services.glue.model.Partition tgt = new com.amazonaws.services.glue.model.Partition();
tgt.setDatabaseName(src.getDbName());
tgt.setTableName(src.getTableName());
tgt.setCreationTime(new Date((long) src.getCreateTime() * 1000));
tgt.setLastAccessTime(new Date((long) src.getLastAccessTime() * 1000));
tgt.setParameters(src.getParameters());
tgt.setStorageDescriptor(convertStorageDescriptor(src.getSd()));
tgt.setValues(src.getValues());
return tgt;
}
public static String convertListToString(final List<String> list) {
if (list == null) {
return null;
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < list.size(); i++) {
String currentString = list.get(i);
sb.append(currentString.length() + "$" + currentString);
}
return sb.toString();
}
public static Map<String, String> convertSkewedMap(final Map<List<String>, String> coreSkewedMap){
if (coreSkewedMap == null){
return null;
}
Map<String, String> catalogSkewedMap = new HashMap<>();
for (List<String> coreKey : coreSkewedMap.keySet()) {
catalogSkewedMap.put(convertListToString(coreKey), coreSkewedMap.get(coreKey));
}
return catalogSkewedMap;
}
public static List<String> convertSkewedValue(final List<List<String>> coreSkewedValue) {
if (coreSkewedValue == null) {
return null;
}
List<String> catalogSkewedValue = new ArrayList<>();
for (int i = 0; i < coreSkewedValue.size(); i++) {
catalogSkewedValue.add(convertListToString(coreSkewedValue.get(i)));
}
return catalogSkewedValue;
}
public static com.amazonaws.services.glue.model.UserDefinedFunction convertFunction(final Function hiveFunction) {
if (hiveFunction == null ){
return null;
}
com.amazonaws.services.glue.model.UserDefinedFunction catalogFunction = new com.amazonaws.services.glue.model.UserDefinedFunction();
catalogFunction.setClassName(hiveFunction.getClassName());
catalogFunction.setFunctionName(hiveFunction.getFunctionName());
catalogFunction.setCreateTime(new Date((long) (hiveFunction.getCreateTime()) * 1000));
catalogFunction.setOwnerName(hiveFunction.getOwnerName());
if(hiveFunction.getOwnerType() != null) {
catalogFunction.setOwnerType(hiveFunction.getOwnerType().name());
}
catalogFunction.setResourceUris(covertResourceUriList(hiveFunction.getResourceUris()));
return catalogFunction;
}
public static List<com.amazonaws.services.glue.model.ResourceUri> covertResourceUriList(
final List<ResourceUri> hiveResourceUriList) {
if (hiveResourceUriList == null) {
return null;
}
List<com.amazonaws.services.glue.model.ResourceUri> catalogResourceUriList = new ArrayList<>();
for (ResourceUri hiveResourceUri : hiveResourceUriList) {
com.amazonaws.services.glue.model.ResourceUri catalogResourceUri = new com.amazonaws.services.glue.model.ResourceUri();
catalogResourceUri.setUri(hiveResourceUri.getUri());
if (hiveResourceUri.getResourceType() != null) {
catalogResourceUri.setResourceType(hiveResourceUri.getResourceType().name());
}
catalogResourceUriList.add(catalogResourceUri);
}
return catalogResourceUriList;
}
public static List<com.amazonaws.services.glue.model.ColumnStatistics> convertColumnStatisticsObjList(
ColumnStatistics hiveColumnStatistics) {
ColumnStatisticsDesc hiveColumnStatisticsDesc = hiveColumnStatistics.getStatsDesc();
List<ColumnStatisticsObj> hiveColumnStatisticsObjs = hiveColumnStatistics.getStatsObj();
List<com.amazonaws.services.glue.model.ColumnStatistics> catalogColumnStatisticsList = new ArrayList<>();
for (ColumnStatisticsObj hiveColumnStatisticsObj : hiveColumnStatisticsObjs) {
com.amazonaws.services.glue.model.ColumnStatistics catalogColumnStatistics =
new com.amazonaws.services.glue.model.ColumnStatistics();
catalogColumnStatistics.setColumnName(hiveColumnStatisticsObj.getColName());
catalogColumnStatistics.setColumnType(hiveColumnStatisticsObj.getColType());
// Last analyzed time in Hive is in days since Epoch, Java Date is in milliseconds
catalogColumnStatistics.setAnalyzedTime(new Date(TimeUnit.DAYS.toMillis(hiveColumnStatisticsDesc.getLastAnalyzed())));
catalogColumnStatistics.setStatisticsData(convertColumnStatisticsData(hiveColumnStatisticsObj.getStatsData()));
catalogColumnStatisticsList.add(catalogColumnStatistics);
}
return catalogColumnStatisticsList;
}
private static com.amazonaws.services.glue.model.ColumnStatisticsData convertColumnStatisticsData(
ColumnStatisticsData hiveColumnStatisticsData) {
com.amazonaws.services.glue.model.ColumnStatisticsData catalogColumnStatisticsData =
new com.amazonaws.services.glue.model.ColumnStatisticsData();
// Hive uses the TUnion object to ensure that only one stats object is set at any time, this means that we can
// only call the get*() of a stats type if the 'setField' is set to that value
ColumnStatisticsData._Fields setField = hiveColumnStatisticsData.getSetField();
switch (setField) {
case BINARY_STATS:
BinaryColumnStatsData hiveBinaryData = hiveColumnStatisticsData.getBinaryStats();
BinaryColumnStatisticsData catalogBinaryData = new BinaryColumnStatisticsData();
catalogBinaryData.setNumberOfNulls(hiveBinaryData.getNumNulls());
catalogBinaryData.setMaximumLength(hiveBinaryData.getMaxColLen());
catalogBinaryData.setAverageLength(hiveBinaryData.getAvgColLen());
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.BINARY));
catalogColumnStatisticsData.setBinaryColumnStatisticsData(catalogBinaryData);
break;
case BOOLEAN_STATS:
BooleanColumnStatsData hiveBooleanData = hiveColumnStatisticsData.getBooleanStats();
BooleanColumnStatisticsData catalogBooleanData = new BooleanColumnStatisticsData();
catalogBooleanData.setNumberOfNulls(hiveBooleanData.getNumNulls());
catalogBooleanData.setNumberOfFalses(hiveBooleanData.getNumFalses());
catalogBooleanData.setNumberOfTrues(hiveBooleanData.getNumTrues());
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.BOOLEAN));
catalogColumnStatisticsData.setBooleanColumnStatisticsData(catalogBooleanData);
break;
case DATE_STATS:
DateColumnStatsData hiveDateData = hiveColumnStatisticsData.getDateStats();
DateColumnStatisticsData catalogDateData = new DateColumnStatisticsData();
catalogDateData.setNumberOfNulls(hiveDateData.getNumNulls());
catalogDateData.setNumberOfDistinctValues(hiveDateData.getNumDVs());
catalogDateData.setMaximumValue(ConverterUtils.hiveDatetoDate(hiveDateData.getHighValue()));
catalogDateData.setMinimumValue(ConverterUtils.hiveDatetoDate(hiveDateData.getLowValue()));
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.DATE));
catalogColumnStatisticsData.setDateColumnStatisticsData(catalogDateData);
break;
case DECIMAL_STATS:
DecimalColumnStatsData hiveDecimalData = hiveColumnStatisticsData.getDecimalStats();
DecimalColumnStatisticsData catalogDecimalData = new DecimalColumnStatisticsData();
catalogDecimalData.setNumberOfNulls(hiveDecimalData.getNumNulls());
catalogDecimalData.setNumberOfDistinctValues(hiveDecimalData.getNumDVs());
catalogDecimalData.setMaximumValue(convertDecimal(hiveDecimalData.getHighValue()));
catalogDecimalData.setMinimumValue(convertDecimal(hiveDecimalData.getLowValue()));
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.DECIMAL));
catalogColumnStatisticsData.setDecimalColumnStatisticsData(catalogDecimalData);
break;
case DOUBLE_STATS:
DoubleColumnStatsData hiveDoubleData = hiveColumnStatisticsData.getDoubleStats();
DoubleColumnStatisticsData catalogDoubleData = new DoubleColumnStatisticsData();
catalogDoubleData.setNumberOfNulls(hiveDoubleData.getNumNulls());
catalogDoubleData.setNumberOfDistinctValues(hiveDoubleData.getNumDVs());
catalogDoubleData.setMaximumValue(hiveDoubleData.getHighValue());
catalogDoubleData.setMinimumValue(hiveDoubleData.getLowValue());
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.DOUBLE));
catalogColumnStatisticsData.setDoubleColumnStatisticsData(catalogDoubleData);
break;
case LONG_STATS:
LongColumnStatsData hiveLongData = hiveColumnStatisticsData.getLongStats();
LongColumnStatisticsData catalogLongData = new LongColumnStatisticsData();
catalogLongData.setNumberOfNulls(hiveLongData.getNumNulls());
catalogLongData.setNumberOfDistinctValues(hiveLongData.getNumDVs());
catalogLongData.setMaximumValue(hiveLongData.getHighValue());
catalogLongData.setMinimumValue(hiveLongData.getLowValue());
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.LONG));
catalogColumnStatisticsData.setLongColumnStatisticsData(catalogLongData);
break;
case STRING_STATS:
StringColumnStatsData hiveStringData = hiveColumnStatisticsData.getStringStats();
StringColumnStatisticsData catalogStringData = new StringColumnStatisticsData();
catalogStringData.setNumberOfNulls(hiveStringData.getNumNulls());
catalogStringData.setNumberOfDistinctValues(hiveStringData.getNumDVs());
catalogStringData.setMaximumLength(hiveStringData.getMaxColLen());
catalogStringData.setAverageLength(hiveStringData.getAvgColLen());
catalogColumnStatisticsData.setType(String.valueOf(ColumnStatisticsType.STRING));
catalogColumnStatisticsData.setStringColumnStatisticsData(catalogStringData);
break;
}
return catalogColumnStatisticsData;
}
private static com.amazonaws.services.glue.model.DecimalNumber convertDecimal(Decimal hiveDecimal) {
com.amazonaws.services.glue.model.DecimalNumber catalogDecimal =
new com.amazonaws.services.glue.model.DecimalNumber();
catalogDecimal.setUnscaledValue(ByteBuffer.wrap(hiveDecimal.getUnscaled()));
catalogDecimal.setScale((int)hiveDecimal.getScale());
return catalogDecimal;
}
}